| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2477.2917098999023, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.08594289422035217, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": -0.0, | |
| "reward": 0.7812721505761147, | |
| "reward_std": 0.7786822463385761, | |
| "rewards/cosine_scaled_reward": 0.09896941180340946, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2531.083351135254, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.09191003441810608, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0, | |
| "reward": 0.5250445622950792, | |
| "reward_std": 0.6820136681199074, | |
| "rewards/cosine_scaled_reward": 0.033355601772200316, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 3001.583335876465, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.07898570597171783, | |
| "kl": 5.7190656661987305e-05, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": -0.08641314879059792, | |
| "reward_std": 0.6380448043346405, | |
| "rewards/cosine_scaled_reward": -0.16820657439529896, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 1352.4166946411133, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.12036105990409851, | |
| "kl": 4.320591688156128e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": 0.7256108671426773, | |
| "reward_std": 0.6638269834220409, | |
| "rewards/cosine_scaled_reward": -0.05386124923825264, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3265.1458435058594, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.07013845443725586, | |
| "kl": 5.491077899932861e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.13684228248894215, | |
| "reward_std": 0.6495966352522373, | |
| "rewards/cosine_scaled_reward": -0.07741219899617136, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 2714.7916870117188, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.12832961976528168, | |
| "kl": 6.848573684692383e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": 0.036636670120060444, | |
| "reward_std": 0.5894878190010786, | |
| "rewards/cosine_scaled_reward": -0.20043167704716325, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2924.2083892822266, | |
| "epoch": 0.008, | |
| "grad_norm": 0.06487897038459778, | |
| "kl": 3.6388635635375977e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.5618981756269932, | |
| "reward_std": 0.8023515623062849, | |
| "rewards/cosine_scaled_reward": -0.021134261041879654, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2468.8541946411133, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.0644845962524414, | |
| "kl": 2.9161572456359863e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.680436591617763, | |
| "reward_std": 0.8313349261879921, | |
| "rewards/cosine_scaled_reward": 0.058968290220946074, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 2919.791679382324, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.08689959347248077, | |
| "kl": 4.6640634536743164e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.11137600243091583, | |
| "reward_std": 0.5826327633112669, | |
| "rewards/cosine_scaled_reward": -0.10056201240513474, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2591.250015258789, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.08745718002319336, | |
| "kl": 3.638118505477905e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.2279951237142086, | |
| "reward_std": 0.8613783605396748, | |
| "rewards/cosine_scaled_reward": -0.11516911163926125, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3401.7708740234375, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.05986722558736801, | |
| "kl": 4.547834396362305e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": -0.21977794915437698, | |
| "reward_std": 0.7277414798736572, | |
| "rewards/cosine_scaled_reward": -0.18280563806183636, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2057.729202270508, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.08447223156690598, | |
| "kl": 3.4344382584095e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": 0.5926045663654804, | |
| "reward_std": 0.7262464184314013, | |
| "rewards/cosine_scaled_reward": -0.037031049840152264, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2858.6458435058594, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.07529250532388687, | |
| "kl": 4.3526291847229004e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.310442678630352, | |
| "reward_std": 0.6166130974888802, | |
| "rewards/cosine_scaled_reward": -0.0010286467149853706, | |
| "rewards/format_reward": 0.3125, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2338.9583740234375, | |
| "epoch": 0.016, | |
| "grad_norm": 0.0749119371175766, | |
| "kl": 3.138929605484009e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.3668381357565522, | |
| "reward_std": 0.8442271370440722, | |
| "rewards/cosine_scaled_reward": -0.09783094096928835, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2767.208335876465, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.07692641764879227, | |
| "kl": 4.416704177856445e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5312376767396927, | |
| "reward_std": 0.6873207837343216, | |
| "rewards/cosine_scaled_reward": 0.0572854932397604, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3558.0416870117188, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.05166986584663391, | |
| "kl": 5.1349401473999023e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": -0.3318660706281662, | |
| "reward_std": 0.4215814145281911, | |
| "rewards/cosine_scaled_reward": -0.1867663636803627, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 2167.937545776367, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.11954422295093536, | |
| "kl": 5.5149197578430176e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.6245296709239483, | |
| "reward_std": 0.6979014091193676, | |
| "rewards/cosine_scaled_reward": 0.010181492194533348, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2954.958396911621, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.05468250438570976, | |
| "kl": 3.195181488990784e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.3992938678711653, | |
| "reward_std": 0.768631786108017, | |
| "rewards/cosine_scaled_reward": -0.008686407003551722, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2836.812530517578, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.06989579647779465, | |
| "kl": 4.445016384124756e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.026630605570972, | |
| "reward_std": 1.01700534299016, | |
| "rewards/cosine_scaled_reward": 0.2528986446559429, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 1982.8958587646484, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.08062257617712021, | |
| "kl": 3.238394856452942e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.6854300163686275, | |
| "reward_std": 0.6129952324554324, | |
| "rewards/cosine_scaled_reward": -0.011451659724116325, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2610.500011444092, | |
| "epoch": 0.024, | |
| "grad_norm": 0.1087082102894783, | |
| "kl": 6.441771984100342e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.23938602954149246, | |
| "reward_std": 0.7100841701030731, | |
| "rewards/cosine_scaled_reward": -0.057390330359339714, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1582.0625648498535, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.11413239687681198, | |
| "kl": 2.413429319858551e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.7976827728562057, | |
| "reward_std": 0.5391496056690812, | |
| "rewards/cosine_scaled_reward": 0.0030080433934926987, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2273.2708587646484, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.09884999692440033, | |
| "kl": 3.8236379623413086e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.645795910153538, | |
| "reward_std": 0.9579760804772377, | |
| "rewards/cosine_scaled_reward": 0.041647952515631914, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2277.250045776367, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.09460888057947159, | |
| "kl": 3.688409924507141e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.8067844077013433, | |
| "reward_std": 0.7267263159155846, | |
| "rewards/cosine_scaled_reward": 0.07005884870886803, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2365.4375228881836, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.07914211601018906, | |
| "kl": 4.9017369747161865e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.2558484375476837, | |
| "reward_std": 0.6883654966950417, | |
| "rewards/cosine_scaled_reward": -0.12207578588277102, | |
| "rewards/format_reward": 0.5, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 2899.8750534057617, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.0661776065826416, | |
| "kl": 4.620850086212158e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.2165616676211357, | |
| "reward_std": 0.6169561371207237, | |
| "rewards/cosine_scaled_reward": -0.13130250154063106, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2911.375030517578, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.08687944710254669, | |
| "kl": 5.9038400650024414e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.03436022438108921, | |
| "reward_std": 0.6152231078594923, | |
| "rewards/cosine_scaled_reward": -0.13906989246606827, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2627.7292251586914, | |
| "epoch": 0.032, | |
| "grad_norm": 0.07916371524333954, | |
| "kl": 5.59687614440918e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.5622920994646847, | |
| "reward_std": 0.9138174392282963, | |
| "rewards/cosine_scaled_reward": 0.04156270687235519, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3058.541717529297, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.06868138164281845, | |
| "kl": 3.463029861450195e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.24681237153708935, | |
| "reward_std": 0.5282476097345352, | |
| "rewards/cosine_scaled_reward": -0.24840618949383497, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2708.6250610351562, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.0861496552824974, | |
| "kl": 4.427134990692139e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.6493414626456797, | |
| "reward_std": 1.0688889399170876, | |
| "rewards/cosine_scaled_reward": 0.06425405736081302, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 3106.854179382324, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.07984768599271774, | |
| "kl": 5.18113374710083e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": -0.21240638848394156, | |
| "reward_std": 0.567398814484477, | |
| "rewards/cosine_scaled_reward": -0.21036987099796534, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 2783.0416870117188, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.07997466623783112, | |
| "kl": 5.587935447692871e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.4500444643199444, | |
| "reward_std": 0.9137975797057152, | |
| "rewards/cosine_scaled_reward": -0.004144447855651379, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3032.437545776367, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.07135379314422607, | |
| "kl": 4.722177982330322e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.15741109126247466, | |
| "reward_std": 0.7865993864834309, | |
| "rewards/cosine_scaled_reward": -0.14004445355385542, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 2224.1458740234375, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.0843958705663681, | |
| "kl": 4.30806539952755e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.0836156494915485, | |
| "reward_std": 0.6480832640081644, | |
| "rewards/cosine_scaled_reward": 0.23972447961568832, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3144.333354949951, | |
| "epoch": 0.04, | |
| "grad_norm": 0.10391873121261597, | |
| "kl": 5.939602851867676e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": -0.06451615691184998, | |
| "reward_std": 0.6315669789910316, | |
| "rewards/cosine_scaled_reward": -0.15725808148272336, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 3455.4166870117188, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.0721229612827301, | |
| "kl": 5.930662155151367e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": -0.3214810397475958, | |
| "reward_std": 0.5321759339421988, | |
| "rewards/cosine_scaled_reward": -0.23365718219429255, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 3305.979217529297, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.05366111919283867, | |
| "kl": 5.034357309341431e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": -0.1500344700179994, | |
| "reward_std": 0.5052997134625912, | |
| "rewards/cosine_scaled_reward": -0.20001725386828184, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3285.4375, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.05576445534825325, | |
| "kl": 5.2601099014282227e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.0856419075280428, | |
| "reward_std": 0.5338072516024113, | |
| "rewards/cosine_scaled_reward": -0.11573762446641922, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2866.583366394043, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.07349838316440582, | |
| "kl": 3.793835639953613e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.5124117014929652, | |
| "reward_std": 0.3881341894157231, | |
| "rewards/cosine_scaled_reward": 0.006205825367942452, | |
| "rewards/format_reward": 0.5, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2531.416702270508, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.0760888159275055, | |
| "kl": 3.7960708141326904e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.2472677268087864, | |
| "reward_std": 0.5154618471860886, | |
| "rewards/cosine_scaled_reward": -0.12636615056544542, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3004.9583740234375, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.05898589268326759, | |
| "kl": 4.388391971588135e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": -0.008114629774354398, | |
| "reward_std": 0.6303666643798351, | |
| "rewards/cosine_scaled_reward": -0.18114064447581768, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2882.1875, | |
| "epoch": 0.048, | |
| "grad_norm": 0.10115097463130951, | |
| "kl": 6.967782974243164e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.4181015230715275, | |
| "reward_std": 0.35371557623147964, | |
| "rewards/cosine_scaled_reward": -0.3340507596731186, | |
| "rewards/format_reward": 0.25, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3062.4583740234375, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.056427061557769775, | |
| "kl": 4.2632222175598145e-05, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.019303516950458288, | |
| "reward_std": 0.5846860222518444, | |
| "rewards/cosine_scaled_reward": -0.12423509394284338, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2561.145866394043, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.10883927345275879, | |
| "kl": 3.974884748458862e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.32845963537693024, | |
| "reward_std": 0.6221309639513493, | |
| "rewards/cosine_scaled_reward": -0.05452017858624458, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3391.2708740234375, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.06354381144046783, | |
| "kl": 4.704296588897705e-05, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.012901787646114826, | |
| "reward_std": 0.6717023346573114, | |
| "rewards/cosine_scaled_reward": -0.14186757057905197, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 3130.854202270508, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.06644223630428314, | |
| "kl": 4.620850086212158e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": -0.12418931908905506, | |
| "reward_std": 0.5467363391071558, | |
| "rewards/cosine_scaled_reward": -0.17667799070477486, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2594.312515258789, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.2604289650917053, | |
| "kl": 6.350129842758179e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.23113940935581923, | |
| "reward_std": 0.9318405166268349, | |
| "rewards/cosine_scaled_reward": -0.09276363614480942, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2569.7916946411133, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.07575666159391403, | |
| "kl": 4.439055919647217e-05, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.4222676699864678, | |
| "reward_std": 0.9641231074929237, | |
| "rewards/cosine_scaled_reward": -0.028449506498873234, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 1905.270881652832, | |
| "epoch": 0.056, | |
| "grad_norm": 0.10799656808376312, | |
| "kl": 4.303455352783203e-05, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.5841925805434585, | |
| "reward_std": 0.4271776806563139, | |
| "rewards/cosine_scaled_reward": -0.030820404179394245, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 3055.333354949951, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.06481879204511642, | |
| "kl": 3.377348184585571e-05, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.3548060841858387, | |
| "reward_std": 0.6479707816615701, | |
| "rewards/cosine_scaled_reward": 0.021153047680854797, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2422.8958587646484, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.08640994131565094, | |
| "kl": 5.37186861038208e-05, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.2937164641916752, | |
| "reward_std": 0.5943415053188801, | |
| "rewards/cosine_scaled_reward": -0.0823084469884634, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2708.2292098999023, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.0866185650229454, | |
| "kl": 4.3101608753204346e-05, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.7994312755763531, | |
| "reward_std": 0.9482753574848175, | |
| "rewards/cosine_scaled_reward": 0.1184656445402652, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2714.229232788086, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.07880908250808716, | |
| "kl": 4.388391971588135e-05, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 0.5792883150279522, | |
| "reward_std": 1.1162610426545143, | |
| "rewards/cosine_scaled_reward": 0.039644140750169754, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2341.9583740234375, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.10119149833917618, | |
| "kl": 4.163850098848343e-05, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 0.9983950331807137, | |
| "reward_std": 0.7757038623094559, | |
| "rewards/cosine_scaled_reward": 0.16586415050551295, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2911.3333435058594, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.06214113533496857, | |
| "kl": 3.289803862571716e-05, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.5057209208607674, | |
| "reward_std": 0.8223349414765835, | |
| "rewards/cosine_scaled_reward": 0.02369379624724388, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2966.2292098999023, | |
| "epoch": 0.064, | |
| "grad_norm": 0.0655374601483345, | |
| "kl": 4.0337443351745605e-05, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.1729863523505628, | |
| "reward_std": 0.6948586534708738, | |
| "rewards/cosine_scaled_reward": -0.12184017524123192, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3141.1041870117188, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.049183689057826996, | |
| "kl": 3.1888484954833984e-05, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.03709188476204872, | |
| "reward_std": 0.6826784070581198, | |
| "rewards/cosine_scaled_reward": -0.16895407100673765, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 2167.9167251586914, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.09783437103033066, | |
| "kl": 3.758072853088379e-05, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": 0.691959026909899, | |
| "reward_std": 0.6940444465726614, | |
| "rewards/cosine_scaled_reward": 0.01264617033302784, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2829.1666870117188, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.07547919452190399, | |
| "kl": 3.076065331697464e-05, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": -0.14313821494579315, | |
| "reward_std": 0.5542087014764547, | |
| "rewards/cosine_scaled_reward": -0.22781910531921312, | |
| "rewards/format_reward": 0.3125, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2783.0000381469727, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.07219178974628448, | |
| "kl": 4.0277838706970215e-05, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.3322446085512638, | |
| "reward_std": 0.8663923796266317, | |
| "rewards/cosine_scaled_reward": -0.07346104085445404, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2816.0625610351562, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.056101903319358826, | |
| "kl": 3.6172568798065186e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.5485197044909, | |
| "reward_std": 0.7189056314527988, | |
| "rewards/cosine_scaled_reward": 0.013843156397342682, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2418.9792251586914, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.07655443251132965, | |
| "kl": 2.9717572033405304e-05, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.6249394756741822, | |
| "reward_std": 0.7466630786657333, | |
| "rewards/cosine_scaled_reward": 0.04163640830665827, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 1990.1875305175781, | |
| "epoch": 0.072, | |
| "grad_norm": 0.8183447122573853, | |
| "kl": 3.591179847717285e-05, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.7574019301682711, | |
| "reward_std": 0.7115380149334669, | |
| "rewards/cosine_scaled_reward": 0.003700951114296913, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2794.041702270508, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.07740171253681183, | |
| "kl": 4.895031452178955e-05, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0, | |
| "reward": 0.2606558855623007, | |
| "reward_std": 0.6461843717843294, | |
| "rewards/cosine_scaled_reward": -0.06758872466161847, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2737.583366394043, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.07800371199846268, | |
| "kl": 3.081047907471657e-05, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.27562767267227173, | |
| "reward_std": 0.48519637808203697, | |
| "rewards/cosine_scaled_reward": -0.08093616738915443, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2020.437515258789, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.13267271220684052, | |
| "kl": 3.864150494337082e-05, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.363989882171154, | |
| "reward_std": 0.8412193078547716, | |
| "rewards/cosine_scaled_reward": -0.06800505332648754, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3430.5416870117188, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.04681139066815376, | |
| "kl": 3.781914710998535e-05, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": -0.4500653734430671, | |
| "reward_std": 0.35858835093677044, | |
| "rewards/cosine_scaled_reward": -0.2666993495076895, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1721.5000228881836, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.10184609889984131, | |
| "kl": 3.907829523086548e-05, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": 0.803021315485239, | |
| "reward_std": 0.7033775001764297, | |
| "rewards/cosine_scaled_reward": 0.036927306558936834, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2023.9375228881836, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.09549941122531891, | |
| "kl": 5.4717063903808594e-05, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0, | |
| "reward": 0.1851830668747425, | |
| "reward_std": 0.38896309956908226, | |
| "rewards/cosine_scaled_reward": -0.20949181518517435, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 2992.2083435058594, | |
| "epoch": 0.08, | |
| "grad_norm": 0.06805767118930817, | |
| "kl": 4.3198466300964355e-05, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": 0.13163810409605503, | |
| "reward_std": 0.7089879354462028, | |
| "rewards/cosine_scaled_reward": -0.13209761306643486, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2657.9791717529297, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.09039611369371414, | |
| "kl": 4.0724873542785645e-05, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.40160089544951916, | |
| "reward_std": 0.632799606770277, | |
| "rewards/cosine_scaled_reward": 0.013300436548888683, | |
| "rewards/format_reward": 0.375, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2680.312515258789, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.09396609663963318, | |
| "kl": 4.735589027404785e-05, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.17568538337945938, | |
| "reward_std": 0.5999180413782597, | |
| "rewards/cosine_scaled_reward": -0.16215731669217348, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3368.6041870117188, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.052425187081098557, | |
| "kl": 4.410743713378906e-05, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": 0.17505339160561562, | |
| "reward_std": 0.7316120192408562, | |
| "rewards/cosine_scaled_reward": -0.03747331537306309, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2401.645866394043, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.08559257537126541, | |
| "kl": 3.143027424812317e-05, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": 0.633028719574213, | |
| "reward_std": 0.7289652414619923, | |
| "rewards/cosine_scaled_reward": 0.06651434814557433, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2861.166717529297, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.04925947263836861, | |
| "kl": 3.231316804885864e-05, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.4444448696449399, | |
| "reward_std": 0.5791845098137856, | |
| "rewards/cosine_scaled_reward": -0.0069442447274923325, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2688.395866394043, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.08172833174467087, | |
| "kl": 3.68654727935791e-05, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": -0.06408658530563116, | |
| "reward_std": 0.33793799951672554, | |
| "rewards/cosine_scaled_reward": -0.2403766242787242, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 2887.5000381469727, | |
| "epoch": 0.088, | |
| "grad_norm": 0.06948503851890564, | |
| "kl": 5.298107862472534e-05, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": -0.029417846351861954, | |
| "reward_std": 0.4597672317177057, | |
| "rewards/cosine_scaled_reward": -0.18137559248134494, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 3311.250045776367, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.05178945139050484, | |
| "kl": 3.871321678161621e-05, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": -0.007005671039223671, | |
| "reward_std": 0.7657842040061951, | |
| "rewards/cosine_scaled_reward": -0.10766951239202172, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2193.5000076293945, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.07584657520055771, | |
| "kl": 3.458559513092041e-05, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.42460418306291103, | |
| "reward_std": 0.5317995389923453, | |
| "rewards/cosine_scaled_reward": -0.07936457544565201, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3340.2291870117188, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.05549027398228645, | |
| "kl": 5.620718002319336e-05, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": -0.22576706111431122, | |
| "reward_std": 0.5958872698247433, | |
| "rewards/cosine_scaled_reward": -0.23788353614509106, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3031.8333435058594, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.08398247510194778, | |
| "kl": 5.17209991812706e-05, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": -0.16735144145786762, | |
| "reward_std": 0.4632125534117222, | |
| "rewards/cosine_scaled_reward": -0.20867571234703064, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2600.541702270508, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.07950612157583237, | |
| "kl": 4.464387893676758e-05, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0, | |
| "reward": 0.26828794181346893, | |
| "reward_std": 0.8547205068171024, | |
| "rewards/cosine_scaled_reward": -0.10543935932219028, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2600.3333740234375, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.09987322241067886, | |
| "kl": 5.161762237548828e-05, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.4237690633162856, | |
| "reward_std": 0.6998065821826458, | |
| "rewards/cosine_scaled_reward": 0.013967860024422407, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2754.7292098999023, | |
| "epoch": 0.096, | |
| "grad_norm": 0.09407833963632584, | |
| "kl": 3.902614116668701e-05, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": 0.49883759021759033, | |
| "reward_std": 1.035055298358202, | |
| "rewards/cosine_scaled_reward": -0.0005812053568661213, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3129.1875610351562, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.0496111661195755, | |
| "kl": 3.4864991903305054e-05, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.24320119991898537, | |
| "reward_std": 0.9415012821555138, | |
| "rewards/cosine_scaled_reward": -0.07631607260555029, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2761.9583587646484, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.08504597842693329, | |
| "kl": 5.344301462173462e-05, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.0760206263512373, | |
| "reward_std": 0.5459963586181402, | |
| "rewards/cosine_scaled_reward": -0.19115635566413403, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2701.4167404174805, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.09614969789981842, | |
| "kl": 4.2945146560668945e-05, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.4867006968706846, | |
| "reward_std": 0.7119872663170099, | |
| "rewards/cosine_scaled_reward": 0.003767012618482113, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2059.1042098999023, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.09717044234275818, | |
| "kl": 4.3779611587524414e-05, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0, | |
| "reward": 0.6710593365132809, | |
| "reward_std": 0.7761991564184427, | |
| "rewards/cosine_scaled_reward": 0.02302964823320508, | |
| "rewards/format_reward": 0.6250000018626451, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 2918.9583740234375, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.06425134837627411, | |
| "kl": 3.78340482711792e-05, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0, | |
| "reward": 0.19752277620136738, | |
| "reward_std": 0.9163094125688076, | |
| "rewards/cosine_scaled_reward": -0.10957194399088621, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2300.083335876465, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.12959975004196167, | |
| "kl": 5.7190656661987305e-05, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0, | |
| "reward": 0.18251994997262955, | |
| "reward_std": 0.5502962041646242, | |
| "rewards/cosine_scaled_reward": -0.1587400329299271, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 2872.875030517578, | |
| "epoch": 0.104, | |
| "grad_norm": 0.07099516689777374, | |
| "kl": 4.890561103820801e-05, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "reward": 0.17524861101992428, | |
| "reward_std": 0.6752717737108469, | |
| "rewards/cosine_scaled_reward": -0.07904237881302834, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2566.6666717529297, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.0792502760887146, | |
| "kl": 4.4971704483032227e-05, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0, | |
| "reward": 0.13229679688811302, | |
| "reward_std": 0.4320233128964901, | |
| "rewards/cosine_scaled_reward": -0.1526016043499112, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3584.0, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.051544271409511566, | |
| "kl": 5.6415796279907227e-05, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0, | |
| "reward": -0.4429354555904865, | |
| "reward_std": 0.3659748025238514, | |
| "rewards/cosine_scaled_reward": -0.22146772593259811, | |
| "rewards/format_reward": 0.0, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 2366.6458740234375, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.09649702161550522, | |
| "kl": 6.125122308731079e-05, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0, | |
| "reward": 0.37258910946547985, | |
| "reward_std": 0.49216968566179276, | |
| "rewards/cosine_scaled_reward": -0.08453879505395889, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3359.541717529297, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.05108783394098282, | |
| "kl": 4.139542579650879e-05, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0, | |
| "reward": -0.045478057116270065, | |
| "reward_std": 0.8096121177077293, | |
| "rewards/cosine_scaled_reward": -0.13732236111536622, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2446.0208587646484, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.08617669343948364, | |
| "kl": 3.606826066970825e-05, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": 0.2514612413942814, | |
| "reward_std": 0.7137120559345931, | |
| "rewards/cosine_scaled_reward": -0.11385272908955812, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 2909.479217529297, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.06753314286470413, | |
| "kl": 5.036592483520508e-05, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0, | |
| "reward": 0.12703711912035942, | |
| "reward_std": 0.6432105712592602, | |
| "rewards/cosine_scaled_reward": -0.12398143857717514, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2753.2916946411133, | |
| "epoch": 0.112, | |
| "grad_norm": 0.08222991228103638, | |
| "kl": 3.345683217048645e-05, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.25094920583069324, | |
| "reward_std": 0.5600340981036425, | |
| "rewards/cosine_scaled_reward": -0.062025411054492, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2800.9583587646484, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.1139119416475296, | |
| "kl": 4.9620866775512695e-05, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": -0.010324659757316113, | |
| "reward_std": 0.6009259968996048, | |
| "rewards/cosine_scaled_reward": -0.15099567361176014, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2640.916717529297, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.08766157180070877, | |
| "kl": 4.605203866958618e-05, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0, | |
| "reward": 0.3726149722933769, | |
| "reward_std": 0.9378994293510914, | |
| "rewards/cosine_scaled_reward": -0.03244251757860184, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2739.562545776367, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.07954414933919907, | |
| "kl": 5.1237642765045166e-05, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "reward": 0.38678132742643356, | |
| "reward_std": 0.5259938053786755, | |
| "rewards/cosine_scaled_reward": -0.03577600605785847, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 1961.7500305175781, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.09844502061605453, | |
| "kl": 4.3511390686035156e-05, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0, | |
| "reward": 0.7598011903464794, | |
| "reward_std": 0.43429702706635, | |
| "rewards/cosine_scaled_reward": 0.015317256096750498, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2702.812536239624, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.09429902583360672, | |
| "kl": 3.784149885177612e-05, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0, | |
| "reward": 0.18456148356199265, | |
| "reward_std": 0.6498140133917332, | |
| "rewards/cosine_scaled_reward": -0.10563592403195798, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2509.6666717529297, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.09157746285200119, | |
| "kl": 3.937631845474243e-05, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0, | |
| "reward": 0.01755199208855629, | |
| "reward_std": 0.4490024037659168, | |
| "rewards/cosine_scaled_reward": -0.1995573453605175, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2428.437515258789, | |
| "epoch": 0.12, | |
| "grad_norm": 0.07988380640745163, | |
| "kl": 5.3141266107559204e-05, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0, | |
| "reward": 0.35027448181062937, | |
| "reward_std": 0.8200248442590237, | |
| "rewards/cosine_scaled_reward": -0.05402942467480898, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2179.833366394043, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.09466643631458282, | |
| "kl": 1.9486993551254272e-05, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0, | |
| "reward": 1.0094754733145237, | |
| "reward_std": 0.8905338924378157, | |
| "rewards/cosine_scaled_reward": 0.17140439338982105, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2793.750045776367, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.07090191543102264, | |
| "kl": 4.2868778109550476e-05, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0, | |
| "reward": 0.39479758962988853, | |
| "reward_std": 0.5696580000221729, | |
| "rewards/cosine_scaled_reward": -0.031767868902534246, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2560.8750381469727, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.09938930720090866, | |
| "kl": 5.4776668548583984e-05, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.28709584288299084, | |
| "reward_std": 0.5398255866020918, | |
| "rewards/cosine_scaled_reward": -0.09603541763499379, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 3012.375015258789, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.06818244606256485, | |
| "kl": 4.0665268898010254e-05, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0, | |
| "reward": 0.3035507909953594, | |
| "reward_std": 0.46919523924589157, | |
| "rewards/cosine_scaled_reward": -0.0253079435788095, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2645.2292251586914, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.08993928879499435, | |
| "kl": 3.9204955101013184e-05, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 0.4235295169055462, | |
| "reward_std": 1.0312484987080097, | |
| "rewards/cosine_scaled_reward": -0.048651926685124636, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 3142.7708587646484, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.07124299556016922, | |
| "kl": 5.334615707397461e-05, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0, | |
| "reward": 0.14513955544680357, | |
| "reward_std": 0.5376212261617184, | |
| "rewards/cosine_scaled_reward": -0.07326356694102287, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3287.937530517578, | |
| "epoch": 0.128, | |
| "grad_norm": 0.05447167903184891, | |
| "kl": 5.59389591217041e-05, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0, | |
| "reward": 0.17502618208527565, | |
| "reward_std": 0.618457704782486, | |
| "rewards/cosine_scaled_reward": -0.06873693224042654, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2353.687530517578, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.08908313512802124, | |
| "kl": 5.218386650085449e-05, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.22633516043424606, | |
| "reward_std": 0.6344665475189686, | |
| "rewards/cosine_scaled_reward": -0.1264157413970679, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2288.7500228881836, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.08836738020181656, | |
| "kl": 2.9705464839935303e-05, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0, | |
| "reward": 0.22448506485670805, | |
| "reward_std": 0.4999843235127628, | |
| "rewards/cosine_scaled_reward": -0.1794241452589631, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 3045.7500228881836, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.09010881185531616, | |
| "kl": 5.0008296966552734e-05, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0, | |
| "reward": 0.10089786723256111, | |
| "reward_std": 0.5496318815276027, | |
| "rewards/cosine_scaled_reward": -0.09538440965116024, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3216.020835876465, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.06528913974761963, | |
| "kl": 4.798173904418945e-05, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0, | |
| "reward": -0.28314023464918137, | |
| "reward_std": 0.445820776745677, | |
| "rewards/cosine_scaled_reward": -0.20407012477517128, | |
| "rewards/format_reward": 0.125, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3225.7708892822266, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.05690770968794823, | |
| "kl": 5.7891011238098145e-05, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0, | |
| "reward": -0.20225957222282887, | |
| "reward_std": 0.5611965917050838, | |
| "rewards/cosine_scaled_reward": -0.22612979169934988, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 3057.0416870117188, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.06457793712615967, | |
| "kl": 3.30507755279541e-05, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0, | |
| "reward": 0.823002815246582, | |
| "reward_std": 0.7226730212569237, | |
| "rewards/cosine_scaled_reward": 0.18233474646694958, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2052.770851135254, | |
| "epoch": 0.136, | |
| "grad_norm": 0.1046663373708725, | |
| "kl": 3.627687692642212e-05, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0, | |
| "reward": 0.528248842805624, | |
| "reward_std": 0.6421602182090282, | |
| "rewards/cosine_scaled_reward": -0.0692089144140482, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2901.3333854675293, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.0866093784570694, | |
| "kl": 4.921481013298035e-05, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0, | |
| "reward": 0.02573934569954872, | |
| "reward_std": 0.47552035516127944, | |
| "rewards/cosine_scaled_reward": -0.1329636573791504, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 2034.520839691162, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.09373170882463455, | |
| "kl": 5.099177360534668e-05, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0, | |
| "reward": 0.5283471755683422, | |
| "reward_std": 0.5576587021350861, | |
| "rewards/cosine_scaled_reward": -0.027493092231452465, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2950.208354949951, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.06989584118127823, | |
| "kl": 5.5477023124694824e-05, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0, | |
| "reward": 0.5161612909287214, | |
| "reward_std": 0.6604084149003029, | |
| "rewards/cosine_scaled_reward": 0.049747309647500515, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2997.0208740234375, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.06396129727363586, | |
| "kl": 5.120038986206055e-05, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0, | |
| "reward": 0.05036983895115554, | |
| "reward_std": 0.6437292285263538, | |
| "rewards/cosine_scaled_reward": -0.16231509111821651, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2201.187545776367, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.08878038078546524, | |
| "kl": 1.2524425983428955e-05, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0, | |
| "reward": 0.6177411060780287, | |
| "reward_std": 0.5783316306769848, | |
| "rewards/cosine_scaled_reward": 0.027620548382401466, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2839.7708435058594, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.09004891663789749, | |
| "kl": 4.132091999053955e-05, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0, | |
| "reward": 0.3123480398207903, | |
| "reward_std": 0.6807647086679935, | |
| "rewards/cosine_scaled_reward": -7.599778473377228e-05, | |
| "rewards/format_reward": 0.3125, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2775.2291870117188, | |
| "epoch": 0.144, | |
| "grad_norm": 0.06807711720466614, | |
| "kl": 4.363059997558594e-05, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0, | |
| "reward": 0.255770493298769, | |
| "reward_std": 0.6645963629707694, | |
| "rewards/cosine_scaled_reward": -0.05961475125513971, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 3555.2291870117188, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.05394596606492996, | |
| "kl": 4.8354268074035645e-05, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0, | |
| "reward": -0.4153820718638599, | |
| "reward_std": 0.3901064973324537, | |
| "rewards/cosine_scaled_reward": -0.2493577003479004, | |
| "rewards/format_reward": 0.0833333358168602, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2573.5417137145996, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.0834243968129158, | |
| "kl": 3.619492053985596e-05, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0, | |
| "reward": 0.8157168254256248, | |
| "reward_std": 0.5830879472196102, | |
| "rewards/cosine_scaled_reward": 0.11619172617793083, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3269.125, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.061948034912347794, | |
| "kl": 4.8995018005371094e-05, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0, | |
| "reward": -0.3414502330124378, | |
| "reward_std": 0.430474242195487, | |
| "rewards/cosine_scaled_reward": -0.24364178627729416, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3239.0833435058594, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.0639168992638588, | |
| "kl": 4.304945468902588e-05, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0, | |
| "reward": -0.08573511304712156, | |
| "reward_std": 0.6089969128370285, | |
| "rewards/cosine_scaled_reward": -0.12620089296251535, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2743.6250228881836, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.0986771509051323, | |
| "kl": 5.759298801422119e-05, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0, | |
| "reward": 0.5349823525175452, | |
| "reward_std": 0.8021712291520089, | |
| "rewards/cosine_scaled_reward": 0.0487411692738533, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2729.729179382324, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.07586528360843658, | |
| "kl": 3.489106893539429e-05, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0, | |
| "reward": 0.2761177532374859, | |
| "reward_std": 0.5243139378726482, | |
| "rewards/cosine_scaled_reward": -0.04944112920202315, | |
| "rewards/format_reward": 0.375, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3297.0625, | |
| "epoch": 0.152, | |
| "grad_norm": 0.05855449289083481, | |
| "kl": 4.342198371887207e-05, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0, | |
| "reward": -0.08109603449702263, | |
| "reward_std": 0.6278338022530079, | |
| "rewards/cosine_scaled_reward": -0.1655480198096484, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2581.8958435058594, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.09094421565532684, | |
| "kl": 6.553530693054199e-05, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0, | |
| "reward": 0.49103792384266853, | |
| "reward_std": 0.6567734470590949, | |
| "rewards/cosine_scaled_reward": 0.01635227305814624, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1872.0417022705078, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.1057426780462265, | |
| "kl": 3.15885990858078e-05, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0, | |
| "reward": 1.1517054475843906, | |
| "reward_std": 0.8242154731415212, | |
| "rewards/cosine_scaled_reward": 0.2321027358993888, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2674.916679382324, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.06912790238857269, | |
| "kl": 2.880394458770752e-05, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0, | |
| "reward": 0.47026100382208824, | |
| "reward_std": 1.0016871318221092, | |
| "rewards/cosine_scaled_reward": 0.016380503308027983, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 3101.187515258789, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.06627309322357178, | |
| "kl": 3.966689109802246e-05, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0, | |
| "reward": -0.11830158345401287, | |
| "reward_std": 0.5538353957235813, | |
| "rewards/cosine_scaled_reward": -0.19456746615469456, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2779.708351135254, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.062483008950948715, | |
| "kl": 3.4242868423461914e-05, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0, | |
| "reward": 0.1503081377595663, | |
| "reward_std": 0.6015846244990826, | |
| "rewards/cosine_scaled_reward": -0.13317927159368992, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2941.354202270508, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.07116085290908813, | |
| "kl": 4.464387893676758e-05, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0, | |
| "reward": 0.4298251010477543, | |
| "reward_std": 1.0214788615703583, | |
| "rewards/cosine_scaled_reward": -0.014254124835133553, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 3082.7083587646484, | |
| "epoch": 0.16, | |
| "grad_norm": 0.0749194547533989, | |
| "kl": 4.2766332626342773e-05, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0, | |
| "reward": 0.04133862629532814, | |
| "reward_std": 0.412425871938467, | |
| "rewards/cosine_scaled_reward": -0.08349735150113702, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 3278.8750610351562, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.05537456274032593, | |
| "kl": 4.363059997558594e-05, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0, | |
| "reward": -0.13458121148869395, | |
| "reward_std": 0.6288429368287325, | |
| "rewards/cosine_scaled_reward": -0.2027072822675109, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2582.1250762939453, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.07796003669500351, | |
| "kl": 4.741549491882324e-05, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0, | |
| "reward": 0.5442043896764517, | |
| "reward_std": 0.9210255593061447, | |
| "rewards/cosine_scaled_reward": -0.02998113678768277, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2632.125068664551, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.07705174386501312, | |
| "kl": 4.462897777557373e-05, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0, | |
| "reward": 0.09298153035342693, | |
| "reward_std": 0.8049650602042675, | |
| "rewards/cosine_scaled_reward": -0.1722592432051897, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 3101.6875228881836, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.08543463796377182, | |
| "kl": 5.1856040954589844e-05, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0, | |
| "reward": 0.0988054908812046, | |
| "reward_std": 0.7796483170241117, | |
| "rewards/cosine_scaled_reward": -0.0755972620099783, | |
| "rewards/format_reward": 0.25, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2303.4791870117188, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.11068180948495865, | |
| "kl": 3.673136234283447e-05, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0, | |
| "reward": 0.3519397974014282, | |
| "reward_std": 0.46208097971975803, | |
| "rewards/cosine_scaled_reward": -0.07403010502457619, | |
| "rewards/format_reward": 0.5, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2524.895866394043, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.07554313540458679, | |
| "kl": 3.3855438232421875e-05, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0, | |
| "reward": 0.19796129129827023, | |
| "reward_std": 0.5362901501357555, | |
| "rewards/cosine_scaled_reward": -0.16143603064119816, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 3512.9583740234375, | |
| "epoch": 0.168, | |
| "grad_norm": 0.05464346334338188, | |
| "kl": 4.965066909790039e-05, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0, | |
| "reward": 0.05223717913031578, | |
| "reward_std": 0.6960646230727434, | |
| "rewards/cosine_scaled_reward": -0.07804810348898172, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2805.6458587646484, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.05980600044131279, | |
| "kl": 2.8684735298156738e-05, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0, | |
| "reward": 0.2592185221146792, | |
| "reward_std": 0.4343179054558277, | |
| "rewards/cosine_scaled_reward": -0.09955742349848151, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2796.666679382324, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.06559307128190994, | |
| "kl": 3.460794687271118e-05, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0, | |
| "reward": 0.36950669437646866, | |
| "reward_std": 0.5187438875436783, | |
| "rewards/cosine_scaled_reward": -0.033996658865362406, | |
| "rewards/format_reward": 0.4375, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2662.500015258789, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.09143386781215668, | |
| "kl": 5.4642558097839355e-05, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0, | |
| "reward": 0.1737375184893608, | |
| "reward_std": 0.7364421226084232, | |
| "rewards/cosine_scaled_reward": -0.1214645592845045, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2704.0000762939453, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.08034959435462952, | |
| "kl": 5.844235420227051e-05, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0, | |
| "reward": 0.5385458022356033, | |
| "reward_std": 0.9289969503879547, | |
| "rewards/cosine_scaled_reward": 0.029689552262425423, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 3037.3541984558105, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.08508548885583878, | |
| "kl": 4.589557647705078e-05, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0, | |
| "reward": -0.20604137517511845, | |
| "reward_std": 0.4984567780047655, | |
| "rewards/cosine_scaled_reward": -0.2384373564273119, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2886.4583702087402, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.07923610508441925, | |
| "kl": 4.869699478149414e-05, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0, | |
| "reward": -0.02076043374836445, | |
| "reward_std": 0.5566474124789238, | |
| "rewards/cosine_scaled_reward": -0.1666302215307951, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 3191.666717529297, | |
| "epoch": 0.176, | |
| "grad_norm": 0.054210711270570755, | |
| "kl": 3.435462713241577e-05, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0, | |
| "reward": 0.3346288048196584, | |
| "reward_std": 0.766063136048615, | |
| "rewards/cosine_scaled_reward": -0.009768943302333355, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2445.3541717529297, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.09181364625692368, | |
| "kl": 4.544854164123535e-05, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0, | |
| "reward": 0.47494749538600445, | |
| "reward_std": 0.5804790798574686, | |
| "rewards/cosine_scaled_reward": 0.029140407219529152, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2863.2500076293945, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.06354030966758728, | |
| "kl": 3.014504909515381e-05, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0, | |
| "reward": 0.14422506093978882, | |
| "reward_std": 0.6923067793250084, | |
| "rewards/cosine_scaled_reward": -0.08413747791200876, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 3136.7083740234375, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.057844046503305435, | |
| "kl": 4.6854838728904724e-05, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0, | |
| "reward": -0.13818035274744034, | |
| "reward_std": 0.5338004231452942, | |
| "rewards/cosine_scaled_reward": -0.22534017451107502, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2349.0000534057617, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.08436752110719681, | |
| "kl": 4.320591688156128e-05, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0, | |
| "reward": 0.6595811834558845, | |
| "reward_std": 0.6294469237327576, | |
| "rewards/cosine_scaled_reward": 0.04854058846831322, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 3213.8958892822266, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.054091677069664, | |
| "kl": 3.6016106605529785e-05, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0, | |
| "reward": -0.21697133779525757, | |
| "reward_std": 0.6304801269434392, | |
| "rewards/cosine_scaled_reward": -0.23348568379878998, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 3048.812515258789, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.06472987681627274, | |
| "kl": 4.5880675315856934e-05, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0, | |
| "reward": 0.16828365437686443, | |
| "reward_std": 0.7897966802120209, | |
| "rewards/cosine_scaled_reward": -0.040858184336684644, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 1999.895866394043, | |
| "epoch": 0.184, | |
| "grad_norm": 0.09538356959819794, | |
| "kl": 6.414949893951416e-05, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0, | |
| "reward": 0.40960137266665697, | |
| "reward_std": 0.6267556976526976, | |
| "rewards/cosine_scaled_reward": -0.14936598390340805, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3483.4583435058594, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.05956606566905975, | |
| "kl": 4.011392593383789e-05, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0, | |
| "reward": -0.20746187027543783, | |
| "reward_std": 0.7284103976562619, | |
| "rewards/cosine_scaled_reward": -0.15581427235156298, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2542.541679382324, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.08322984725236893, | |
| "kl": 3.534182906150818e-05, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0, | |
| "reward": 0.7424244098365307, | |
| "reward_std": 0.6049519426887855, | |
| "rewards/cosine_scaled_reward": 0.11079552816227078, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2514.6458587646484, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.08033201843500137, | |
| "kl": 4.0881335735321045e-05, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0, | |
| "reward": 0.4948175232857466, | |
| "reward_std": 0.7647457309067249, | |
| "rewards/cosine_scaled_reward": 0.01824208628386259, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 3301.729202270508, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.05642946809530258, | |
| "kl": 4.70578670501709e-05, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0, | |
| "reward": -0.14827431086450815, | |
| "reward_std": 0.7151197902858257, | |
| "rewards/cosine_scaled_reward": -0.16788716241717339, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2874.687545776367, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.07232528924942017, | |
| "kl": 3.822147846221924e-05, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0, | |
| "reward": 0.07571477070450783, | |
| "reward_std": 0.6265546716749668, | |
| "rewards/cosine_scaled_reward": -0.13922595232725143, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2468.500030517578, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.0896337628364563, | |
| "kl": 3.795325756072998e-05, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0, | |
| "reward": 0.5002969466149807, | |
| "reward_std": 0.8546273484826088, | |
| "rewards/cosine_scaled_reward": -0.010268212645314634, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 3338.854217529297, | |
| "epoch": 0.192, | |
| "grad_norm": 0.05132433772087097, | |
| "kl": 3.501772880554199e-05, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0, | |
| "reward": 0.10490168258547783, | |
| "reward_std": 0.7663179654628038, | |
| "rewards/cosine_scaled_reward": -0.1037991689518094, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2664.0208740234375, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.0617351271212101, | |
| "kl": 2.8446316719055176e-05, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0, | |
| "reward": 0.9187318980693817, | |
| "reward_std": 0.6395005024969578, | |
| "rewards/cosine_scaled_reward": 0.18853259086608887, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2732.2083702087402, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.08002173900604248, | |
| "kl": 3.436952829360962e-05, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0, | |
| "reward": 0.32697401847690344, | |
| "reward_std": 0.40401430800557137, | |
| "rewards/cosine_scaled_reward": -0.034429668448865414, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2504.4792098999023, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.08919814229011536, | |
| "kl": 4.845857620239258e-05, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0, | |
| "reward": 0.6028359653428197, | |
| "reward_std": 0.5255442671477795, | |
| "rewards/cosine_scaled_reward": 0.04100130870938301, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2834.520866394043, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.09267444163560867, | |
| "kl": 4.383176565170288e-05, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0, | |
| "reward": 0.4749290198087692, | |
| "reward_std": 0.5056496858596802, | |
| "rewards/cosine_scaled_reward": 0.060381168499588966, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2033.166690826416, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.1092362254858017, | |
| "kl": 3.1575560569763184e-05, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0, | |
| "reward": 0.24639383889734745, | |
| "reward_std": 0.5247251307591796, | |
| "rewards/cosine_scaled_reward": -0.1476364117115736, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2714.7083892822266, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.07081694900989532, | |
| "kl": 3.2141804695129395e-05, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0, | |
| "reward": 0.27326123882085085, | |
| "reward_std": 0.7056470420211554, | |
| "rewards/cosine_scaled_reward": -0.09253604430705309, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2794.6458740234375, | |
| "epoch": 0.2, | |
| "grad_norm": 0.07309149205684662, | |
| "kl": 3.9517879486083984e-05, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0, | |
| "reward": 0.5578448977321386, | |
| "reward_std": 0.5458577377721667, | |
| "rewards/cosine_scaled_reward": 0.0705891028046608, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2739.4791984558105, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.09856078773736954, | |
| "kl": 4.255026578903198e-05, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0, | |
| "reward": 0.32856111600995064, | |
| "reward_std": 0.9141727909445763, | |
| "rewards/cosine_scaled_reward": -0.06488611380336806, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2937.875045776367, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.08212530612945557, | |
| "kl": 4.664808511734009e-05, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0, | |
| "reward": 0.14141926169395447, | |
| "reward_std": 0.5619379281997681, | |
| "rewards/cosine_scaled_reward": -0.13762371242046356, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2174.229179382324, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.09099788963794708, | |
| "kl": 4.07099723815918e-05, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0, | |
| "reward": 0.6008040135493502, | |
| "reward_std": 0.8306377530097961, | |
| "rewards/cosine_scaled_reward": 0.008735324256122112, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 3044.7916717529297, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.06454001367092133, | |
| "kl": 2.7177389711141586e-05, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0, | |
| "reward": -0.08463140577077866, | |
| "reward_std": 0.5380531623959541, | |
| "rewards/cosine_scaled_reward": -0.18814902938902378, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2292.2500228881836, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.10045332461595535, | |
| "kl": 4.4990330934524536e-05, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0, | |
| "reward": 0.7865173332393169, | |
| "reward_std": 0.5815966906957328, | |
| "rewards/cosine_scaled_reward": 0.09117531962692738, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 3153.625, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.07008782029151917, | |
| "kl": 3.886967897415161e-05, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0, | |
| "reward": -0.07918012514710426, | |
| "reward_std": 0.4492311589419842, | |
| "rewards/cosine_scaled_reward": -0.14375673606991768, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2131.8750762939453, | |
| "epoch": 0.208, | |
| "grad_norm": 0.07609304040670395, | |
| "kl": 3.762543201446533e-05, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0, | |
| "reward": 0.3937217202037573, | |
| "reward_std": 0.810230128467083, | |
| "rewards/cosine_scaled_reward": -0.09480581805109978, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1924.5000457763672, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.11205761879682541, | |
| "kl": 4.197657108306885e-05, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0, | |
| "reward": 0.4191586971282959, | |
| "reward_std": 0.46561193093657494, | |
| "rewards/cosine_scaled_reward": -0.11333732306957245, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2947.333335876465, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.1073969155550003, | |
| "kl": 4.264339804649353e-05, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0, | |
| "reward": -0.16777715273201466, | |
| "reward_std": 0.4177204091101885, | |
| "rewards/cosine_scaled_reward": -0.21930524613708258, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2831.500011444092, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.15596449375152588, | |
| "kl": 3.0463095754384995e-05, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0, | |
| "reward": 0.02601803746074438, | |
| "reward_std": 0.53738350328058, | |
| "rewards/cosine_scaled_reward": -0.15365764708258212, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 3090.0416870117188, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.0567559190094471, | |
| "kl": 3.3606134820729494e-05, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0, | |
| "reward": 0.5229375399649143, | |
| "reward_std": 0.6047189794480801, | |
| "rewards/cosine_scaled_reward": 0.07396874204277992, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2450.625030517578, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.1064734235405922, | |
| "kl": 4.59328293800354e-05, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0, | |
| "reward": 0.01848135655745864, | |
| "reward_std": 0.5846540145576, | |
| "rewards/cosine_scaled_reward": -0.1990926619619131, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 3492.1041870117188, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.054533593356609344, | |
| "kl": 7.897987961769104e-05, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0, | |
| "reward": -0.09857597388327122, | |
| "reward_std": 0.6068163029849529, | |
| "rewards/cosine_scaled_reward": -0.11178799159824848, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2381.1458587646484, | |
| "epoch": 0.216, | |
| "grad_norm": 0.07699088007211685, | |
| "kl": 3.7007033824920654e-05, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0, | |
| "reward": 0.18771117678261362, | |
| "reward_std": 0.68030871078372, | |
| "rewards/cosine_scaled_reward": -0.17697775922715664, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2825.7083435058594, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.06694263964891434, | |
| "kl": 2.7514994144439697e-05, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0, | |
| "reward": 0.24774927645921707, | |
| "reward_std": 0.48048258759081364, | |
| "rewards/cosine_scaled_reward": -0.06362534500658512, | |
| "rewards/format_reward": 0.375, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2449.9791870117188, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.09662597626447678, | |
| "kl": 5.513429641723633e-05, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0, | |
| "reward": 0.3565596635453403, | |
| "reward_std": 0.6051553636789322, | |
| "rewards/cosine_scaled_reward": -0.05088683310896158, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 3399.2916870117188, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.05350002646446228, | |
| "kl": 2.376548945903778e-05, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0, | |
| "reward": -0.17050620168447495, | |
| "reward_std": 0.5745283327996731, | |
| "rewards/cosine_scaled_reward": -0.18941976875066757, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2853.1458587646484, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.07139981538057327, | |
| "kl": 4.512816667556763e-05, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0, | |
| "reward": 0.09095984837040305, | |
| "reward_std": 0.6844956343993545, | |
| "rewards/cosine_scaled_reward": -0.142020083963871, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 3187.3958740234375, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.057363320142030716, | |
| "kl": 2.8539448976516724e-05, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0, | |
| "reward": 0.888536848127842, | |
| "reward_std": 0.7999596782028675, | |
| "rewards/cosine_scaled_reward": 0.2151017845608294, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2867.9375610351562, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.060198407620191574, | |
| "kl": 3.0182301998138428e-05, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0, | |
| "reward": -0.04159543523564935, | |
| "reward_std": 0.4117343556135893, | |
| "rewards/cosine_scaled_reward": -0.2187143824994564, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3518.2083740234375, | |
| "epoch": 0.224, | |
| "grad_norm": 0.05014675855636597, | |
| "kl": 2.6986002922058105e-05, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0, | |
| "reward": 0.14430725458078086, | |
| "reward_std": 0.6370433792471886, | |
| "rewards/cosine_scaled_reward": -0.084096385166049, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2920.3333587646484, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.07932152599096298, | |
| "kl": 3.502890467643738e-05, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0, | |
| "reward": 0.3897266909480095, | |
| "reward_std": 0.7965886853635311, | |
| "rewards/cosine_scaled_reward": 0.007363352924585342, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2881.3541717529297, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.06754656136035919, | |
| "kl": 3.559142351150513e-05, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0, | |
| "reward": 0.29642025753855705, | |
| "reward_std": 0.611411452293396, | |
| "rewards/cosine_scaled_reward": -0.01845655031502247, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 3582.5833435058594, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.04949667677283287, | |
| "kl": 2.1696090698242188e-05, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0, | |
| "reward": -0.546028945595026, | |
| "reward_std": 0.44097290001809597, | |
| "rewards/cosine_scaled_reward": -0.29384780302643776, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 2269.354232788086, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.0826706662774086, | |
| "kl": 4.459172487258911e-05, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0, | |
| "reward": 0.9302392676472664, | |
| "reward_std": 0.6794853825122118, | |
| "rewards/cosine_scaled_reward": 0.12136960681527853, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2691.0833892822266, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.08210141956806183, | |
| "kl": 2.2858381271362305e-05, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0, | |
| "reward": 0.9757370799779892, | |
| "reward_std": 0.9528210125863552, | |
| "rewards/cosine_scaled_reward": 0.2066185399889946, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 2290.2916946411133, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.0913701131939888, | |
| "kl": 2.473965287208557e-05, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0, | |
| "reward": 0.6679499447345734, | |
| "reward_std": 0.4127811063081026, | |
| "rewards/cosine_scaled_reward": 0.09439164772629738, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 3117.875015258789, | |
| "epoch": 0.232, | |
| "grad_norm": 0.06673936545848846, | |
| "kl": 4.2341649532318115e-05, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0, | |
| "reward": -0.02256660722196102, | |
| "reward_std": 0.36536935344338417, | |
| "rewards/cosine_scaled_reward": -0.11544995941221714, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 2442.4583587646484, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.08854111284017563, | |
| "kl": 4.429370164871216e-05, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0, | |
| "reward": 0.34631434828042984, | |
| "reward_std": 0.5793536752462387, | |
| "rewards/cosine_scaled_reward": -0.09767616726458073, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2791.104217529297, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.07635447382926941, | |
| "kl": 2.292729914188385e-05, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0, | |
| "reward": 0.5156640680506825, | |
| "reward_std": 1.2202835828065872, | |
| "rewards/cosine_scaled_reward": -0.002584641450084746, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2960.145835876465, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.07889396697282791, | |
| "kl": 5.0440430641174316e-05, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0, | |
| "reward": -0.00487180519849062, | |
| "reward_std": 0.570111021399498, | |
| "rewards/cosine_scaled_reward": -0.15868590073660016, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 3111.8333740234375, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.07035059481859207, | |
| "kl": 3.085436765104532e-05, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0, | |
| "reward": -0.04450271651148796, | |
| "reward_std": 0.7633909024298191, | |
| "rewards/cosine_scaled_reward": -0.2305846947710961, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2721.2916870117188, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.06604988127946854, | |
| "kl": 3.231130540370941e-05, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0, | |
| "reward": 0.25559964030981064, | |
| "reward_std": 0.386314133182168, | |
| "rewards/cosine_scaled_reward": -0.05970017798244953, | |
| "rewards/format_reward": 0.375, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2529.1875534057617, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.10173772275447845, | |
| "kl": 2.830103039741516e-05, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0, | |
| "reward": 0.40500495955348015, | |
| "reward_std": 0.6829135082662106, | |
| "rewards/cosine_scaled_reward": -0.037080854759551585, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2737.875030517578, | |
| "epoch": 0.24, | |
| "grad_norm": 0.07160615921020508, | |
| "kl": 2.4665147066116333e-05, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0, | |
| "reward": 0.4126194640994072, | |
| "reward_std": 0.5888217613101006, | |
| "rewards/cosine_scaled_reward": 0.008393049472942948, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2676.000015258789, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.06261998414993286, | |
| "kl": 2.3281201720237732e-05, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0, | |
| "reward": 0.4131604013964534, | |
| "reward_std": 0.5786217153072357, | |
| "rewards/cosine_scaled_reward": -0.022586457431316376, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 2151.8333435058594, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.10325627028942108, | |
| "kl": 1.8581748008728027e-05, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0, | |
| "reward": 0.47507104836404324, | |
| "reward_std": 0.4877533782273531, | |
| "rewards/cosine_scaled_reward": -0.022881127893924713, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 2051.0416870117188, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.11275593191385269, | |
| "kl": 3.139302134513855e-05, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0, | |
| "reward": 0.7439306043088436, | |
| "reward_std": 0.6277007050812244, | |
| "rewards/cosine_scaled_reward": 0.028215290512889624, | |
| "rewards/format_reward": 0.6875, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2796.8750534057617, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.07322173565626144, | |
| "kl": 3.0666589736938477e-05, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0, | |
| "reward": 0.4464948996901512, | |
| "reward_std": 0.8664285503327847, | |
| "rewards/cosine_scaled_reward": 0.01491411030292511, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 2475.416679382324, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.10743594914674759, | |
| "kl": 4.0121376514434814e-05, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0, | |
| "reward": 0.12009689398109913, | |
| "reward_std": 0.5320453681051731, | |
| "rewards/cosine_scaled_reward": -0.16911823488771915, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 2033.6667022705078, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.1505194753408432, | |
| "kl": 7.011741399765015e-05, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0, | |
| "reward": 0.578947626054287, | |
| "reward_std": 0.5741180796176195, | |
| "rewards/cosine_scaled_reward": -0.002192860469222069, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 2716.354202270508, | |
| "epoch": 0.248, | |
| "grad_norm": 0.06389462202787399, | |
| "kl": 2.299342304468155e-05, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0, | |
| "reward": 0.23419375042431056, | |
| "reward_std": 0.9428692683577538, | |
| "rewards/cosine_scaled_reward": -0.1224864674732089, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2871.395866394043, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.07211603969335556, | |
| "kl": 1.8540769815444946e-05, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0, | |
| "reward": 0.2273400453850627, | |
| "reward_std": 0.9102189503610134, | |
| "rewards/cosine_scaled_reward": -0.07382998894900084, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 2432.395851135254, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.10301367193460464, | |
| "kl": 4.46811318397522e-05, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0, | |
| "reward": 0.3928310014307499, | |
| "reward_std": 0.6771155875176191, | |
| "rewards/cosine_scaled_reward": -0.043167827650904655, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 2698.2500228881836, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.10218124836683273, | |
| "kl": 2.3185275495052338e-05, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0, | |
| "reward": -0.11642071604728699, | |
| "reward_std": 0.3042832724750042, | |
| "rewards/cosine_scaled_reward": -0.23529369570314884, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2353.9167098999023, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.07714515179395676, | |
| "kl": 3.851950168609619e-05, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0, | |
| "reward": 0.48144758492708206, | |
| "reward_std": 0.625980376615189, | |
| "rewards/cosine_scaled_reward": -0.03010955359786749, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 2297.5833740234375, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.09102893620729446, | |
| "kl": 4.67449426651001e-05, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0, | |
| "reward": 0.6812405036762357, | |
| "reward_std": 0.6718111708760262, | |
| "rewards/cosine_scaled_reward": 0.06978693418204784, | |
| "rewards/format_reward": 0.5416666846722364, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 2456.333366394043, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.07353059202432632, | |
| "kl": 1.781061291694641e-05, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0, | |
| "reward": 0.43085432425141335, | |
| "reward_std": 0.6725905947387218, | |
| "rewards/cosine_scaled_reward": -0.013739500194787979, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 3251.541717529297, | |
| "epoch": 0.256, | |
| "grad_norm": 0.05879911035299301, | |
| "kl": 1.8991529941558838e-05, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0, | |
| "reward": -0.17946593277156353, | |
| "reward_std": 0.715836551040411, | |
| "rewards/cosine_scaled_reward": -0.18348297663033009, | |
| "rewards/format_reward": 0.1875000037252903, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 3110.250011444092, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.0750289261341095, | |
| "kl": 2.0813196897506714e-05, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0, | |
| "reward": -0.15115100768161938, | |
| "reward_std": 0.5358757842332125, | |
| "rewards/cosine_scaled_reward": -0.19015884585678577, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2903.666717529297, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.06739815324544907, | |
| "kl": 2.2884458303451538e-05, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0, | |
| "reward": 0.351996093057096, | |
| "reward_std": 0.8762952201068401, | |
| "rewards/cosine_scaled_reward": -0.0010852883569896221, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 2081.250045776367, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.10184264183044434, | |
| "kl": 2.9000453650951385e-05, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0, | |
| "reward": 0.402258925139904, | |
| "reward_std": 0.6547506116330624, | |
| "rewards/cosine_scaled_reward": -0.13220388256013393, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 2290.68754196167, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.0886116549372673, | |
| "kl": 1.9509345293045044e-05, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0, | |
| "reward": 0.6281102709472179, | |
| "reward_std": 0.7301202099770308, | |
| "rewards/cosine_scaled_reward": 0.05363846756517887, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 3176.3333587646484, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.06190153956413269, | |
| "kl": 4.74490225315094e-05, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0, | |
| "reward": 0.13336317241191864, | |
| "reward_std": 0.5335109885782003, | |
| "rewards/cosine_scaled_reward": -0.027068420546129346, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 3118.770866394043, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.06409095972776413, | |
| "kl": 1.1723721399903297e-05, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0, | |
| "reward": 0.13207979500293732, | |
| "reward_std": 0.7329925578087568, | |
| "rewards/cosine_scaled_reward": -0.09021011181175709, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2592.645851135254, | |
| "epoch": 0.264, | |
| "grad_norm": 0.07948760688304901, | |
| "kl": 1.0892748832702637e-05, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0, | |
| "reward": 0.21768657490611076, | |
| "reward_std": 0.6145608052611351, | |
| "rewards/cosine_scaled_reward": -0.09949005115777254, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 3199.437530517578, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.0610453225672245, | |
| "kl": 2.2662803530693054e-05, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0, | |
| "reward": -0.07720962353050709, | |
| "reward_std": 0.5662393067032099, | |
| "rewards/cosine_scaled_reward": -0.15318814292550087, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2527.3958740234375, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.07935723662376404, | |
| "kl": 2.217913424829021e-05, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0, | |
| "reward": 0.5132673047482967, | |
| "reward_std": 1.1579342857003212, | |
| "rewards/cosine_scaled_reward": -0.0037830215878784657, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2746.4166831970215, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.09484535455703735, | |
| "kl": 3.357976675033569e-05, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0, | |
| "reward": 0.18169519677758217, | |
| "reward_std": 0.5667401887476444, | |
| "rewards/cosine_scaled_reward": -0.09665239416062832, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 2363.833354949951, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.12131030112504959, | |
| "kl": 2.1457672119140625e-05, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0, | |
| "reward": 0.6644677482545376, | |
| "reward_std": 0.663248460739851, | |
| "rewards/cosine_scaled_reward": 0.10306720063090324, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2886.145835876465, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.06665312498807907, | |
| "kl": 2.072751522064209e-05, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0, | |
| "reward": 0.25276677403599024, | |
| "reward_std": 0.6072818320244551, | |
| "rewards/cosine_scaled_reward": -0.07153328275308013, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 2779.812515258789, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.07763387262821198, | |
| "kl": 3.341119736433029e-05, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0, | |
| "reward": 0.2074728086590767, | |
| "reward_std": 0.3905568104237318, | |
| "rewards/cosine_scaled_reward": -0.08376359008252621, | |
| "rewards/format_reward": 0.375, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 3254.2708740234375, | |
| "epoch": 0.272, | |
| "grad_norm": 0.05728929117321968, | |
| "kl": 1.5564262866973877e-05, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0, | |
| "reward": 0.7523380443453789, | |
| "reward_std": 0.9776927344501019, | |
| "rewards/cosine_scaled_reward": 0.11575235472992063, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1804.4375228881836, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.10242630541324615, | |
| "kl": 3.166869282722473e-05, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0, | |
| "reward": 1.1699023116379976, | |
| "reward_std": 0.7132734637707472, | |
| "rewards/cosine_scaled_reward": 0.22036781534552574, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 3074.6875228881836, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.07804732024669647, | |
| "kl": 1.6137957572937012e-05, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0, | |
| "reward": -0.13834909722208977, | |
| "reward_std": 0.416837802156806, | |
| "rewards/cosine_scaled_reward": -0.20459122210741043, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 3319.875030517578, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.0568733736872673, | |
| "kl": 3.516674041748047e-06, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0, | |
| "reward": 0.16049459762871265, | |
| "reward_std": 0.55229333601892, | |
| "rewards/cosine_scaled_reward": -0.09683603886514902, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 2465.541702270508, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.11555071920156479, | |
| "kl": 5.5652111768722534e-05, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0, | |
| "reward": 0.2516896640881896, | |
| "reward_std": 0.7441490637138486, | |
| "rewards/cosine_scaled_reward": -0.11373850936070085, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 2875.416702270508, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.06771781295537949, | |
| "kl": 1.3776123523712158e-05, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0, | |
| "reward": 0.2745806626044214, | |
| "reward_std": 0.7221901593729854, | |
| "rewards/cosine_scaled_reward": -0.05020967312157154, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 3019.458396911621, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.07095371186733246, | |
| "kl": 2.15525105886627e-05, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0, | |
| "reward": 0.11198623012751341, | |
| "reward_std": 0.8036079779267311, | |
| "rewards/cosine_scaled_reward": -0.10025688190944493, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 2509.479217529297, | |
| "epoch": 0.28, | |
| "grad_norm": 0.07597236335277557, | |
| "kl": 2.3078173398971558e-05, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0, | |
| "reward": 0.20437275152653456, | |
| "reward_std": 0.8602081090211868, | |
| "rewards/cosine_scaled_reward": -0.13739696645643562, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 2831.291717529297, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.06382181495428085, | |
| "kl": 1.4416873455047607e-05, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0, | |
| "reward": 0.2353922687470913, | |
| "reward_std": 0.9306090399622917, | |
| "rewards/cosine_scaled_reward": -0.09063721133861691, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 3141.5417098999023, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.10150086134672165, | |
| "kl": 3.682821989059448e-05, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0, | |
| "reward": 0.04184865241404623, | |
| "reward_std": 0.7470915205776691, | |
| "rewards/cosine_scaled_reward": -0.1144923409447074, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 2228.3541831970215, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.09983125329017639, | |
| "kl": 2.7175061404705048e-05, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0, | |
| "reward": 0.8679499141871929, | |
| "reward_std": 0.519476355984807, | |
| "rewards/cosine_scaled_reward": 0.18397497944533825, | |
| "rewards/format_reward": 0.5, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 2049.9791831970215, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.09904063493013382, | |
| "kl": 2.1678395569324493e-05, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0, | |
| "reward": 0.6857778578996658, | |
| "reward_std": 0.562960809096694, | |
| "rewards/cosine_scaled_reward": 0.05122227035462856, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2611.3958740234375, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.08777017146348953, | |
| "kl": 4.871189594268799e-05, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0, | |
| "reward": 0.21096007153391838, | |
| "reward_std": 0.5374563187360764, | |
| "rewards/cosine_scaled_reward": -0.10285330004990101, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 2224.041675567627, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.12650950253009796, | |
| "kl": 4.5960769057273865e-05, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0, | |
| "reward": 0.6370383389294147, | |
| "reward_std": 0.8823517276905477, | |
| "rewards/cosine_scaled_reward": 0.016435828525573015, | |
| "rewards/format_reward": 0.6041666679084301, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2770.9583435058594, | |
| "epoch": 0.288, | |
| "grad_norm": 0.09009101241827011, | |
| "kl": 3.509595990180969e-05, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0, | |
| "reward": 0.16730662202462554, | |
| "reward_std": 0.625939853489399, | |
| "rewards/cosine_scaled_reward": -0.1142633780837059, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2892.0000228881836, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.07430292665958405, | |
| "kl": 1.961551606655121e-05, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0, | |
| "reward": 0.6220215447247028, | |
| "reward_std": 0.620788186788559, | |
| "rewards/cosine_scaled_reward": 0.08184408023953438, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 2727.312557220459, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.09726808220148087, | |
| "kl": 3.484264016151428e-05, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0, | |
| "reward": 0.06352332048118114, | |
| "reward_std": 0.7857347205281258, | |
| "rewards/cosine_scaled_reward": -0.16615500673651695, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 3086.6458587646484, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.07653625309467316, | |
| "kl": 2.278760075569153e-05, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0, | |
| "reward": -0.0511588528752327, | |
| "reward_std": 0.818204790353775, | |
| "rewards/cosine_scaled_reward": -0.16099609807133675, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 3192.5833740234375, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.06298832595348358, | |
| "kl": 2.4273991584777832e-05, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0, | |
| "reward": 0.40953988255932927, | |
| "reward_std": 1.1198224946856499, | |
| "rewards/cosine_scaled_reward": 0.02768661454319954, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 3227.375045776367, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.06461450457572937, | |
| "kl": 1.7869286239147186e-05, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0, | |
| "reward": 0.2203599140048027, | |
| "reward_std": 0.6862583365291357, | |
| "rewards/cosine_scaled_reward": 0.006013283971697092, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 3478.2708435058594, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.05246291309595108, | |
| "kl": 1.8885359168052673e-05, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0, | |
| "reward": 0.08181272400543094, | |
| "reward_std": 0.787172719836235, | |
| "rewards/cosine_scaled_reward": -0.10492698103189468, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 2911.0208435058594, | |
| "epoch": 0.296, | |
| "grad_norm": 0.08555354177951813, | |
| "kl": 2.0079314708709717e-05, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0, | |
| "reward": 0.2695394828915596, | |
| "reward_std": 0.443267859518528, | |
| "rewards/cosine_scaled_reward": -0.01106359250843525, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 2160.895866394043, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.14956389367580414, | |
| "kl": 2.4473294615745544e-05, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0, | |
| "reward": 0.8935365863144398, | |
| "reward_std": 0.6962429541163146, | |
| "rewards/cosine_scaled_reward": 0.1759349536150694, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 3014.7291870117188, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.07903124392032623, | |
| "kl": 2.200435847043991e-05, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0, | |
| "reward": -0.15301374904811382, | |
| "reward_std": 0.4762358646839857, | |
| "rewards/cosine_scaled_reward": -0.21192354569211602, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 3212.5625, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.07498282194137573, | |
| "kl": 3.1574396416544914e-05, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0, | |
| "reward": -0.2782311188057065, | |
| "reward_std": 0.46266084536910057, | |
| "rewards/cosine_scaled_reward": -0.2224489003419876, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2679.5625228881836, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.08766698092222214, | |
| "kl": 2.225919160991907e-05, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0, | |
| "reward": 0.14072518423199654, | |
| "reward_std": 0.6760160718113184, | |
| "rewards/cosine_scaled_reward": -0.12755409069359303, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2920.625015258789, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.06199931353330612, | |
| "kl": 1.263665035367012e-05, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0, | |
| "reward": 0.42131929902825505, | |
| "reward_std": 0.4519203417003155, | |
| "rewards/cosine_scaled_reward": -0.00809035450220108, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 2057.0000228881836, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.11336781084537506, | |
| "kl": 2.76956707239151e-05, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0, | |
| "reward": 0.8533403426408768, | |
| "reward_std": 0.7286555413156748, | |
| "rewards/cosine_scaled_reward": 0.11417016061022878, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 3211.4791717529297, | |
| "epoch": 0.304, | |
| "grad_norm": 0.05811421200633049, | |
| "kl": 1.2663193047046661e-05, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0, | |
| "reward": -0.021148244850337505, | |
| "reward_std": 0.34333381056785583, | |
| "rewards/cosine_scaled_reward": -0.09390746429562569, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 3201.6041717529297, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.05871862918138504, | |
| "kl": -9.872019290924072e-08, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": -0.0, | |
| "reward": -0.15571419894695282, | |
| "reward_std": 0.5217737518250942, | |
| "rewards/cosine_scaled_reward": -0.17160710133612156, | |
| "rewards/format_reward": 0.1875, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2433.083381652832, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.1019405871629715, | |
| "kl": 3.275275230407715e-05, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0, | |
| "reward": 0.2796001695096493, | |
| "reward_std": 0.7619174681603909, | |
| "rewards/cosine_scaled_reward": -0.07894991664215922, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 3138.68754196167, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.07884818315505981, | |
| "kl": 2.5736168026924133e-05, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0, | |
| "reward": -0.08517349883913994, | |
| "reward_std": 0.7708080522716045, | |
| "rewards/cosine_scaled_reward": -0.1675867633894086, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 2893.9166984558105, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.08506487309932709, | |
| "kl": 3.803183790296316e-05, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0, | |
| "reward": 0.2290789857506752, | |
| "reward_std": 0.9645229317247868, | |
| "rewards/cosine_scaled_reward": -0.0625438541173935, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 2348.854202270508, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.08082716166973114, | |
| "kl": 1.085922122001648e-05, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0, | |
| "reward": 0.869732528924942, | |
| "reward_std": 0.6329166023060679, | |
| "rewards/cosine_scaled_reward": 0.0702829168876633, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 2840.7708740234375, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.08175045996904373, | |
| "kl": 2.147303894162178e-05, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0, | |
| "reward": 0.07121117413043976, | |
| "reward_std": 0.5003596153110266, | |
| "rewards/cosine_scaled_reward": -0.13106108270585537, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 2674.354217529297, | |
| "epoch": 0.312, | |
| "grad_norm": 0.07014412432909012, | |
| "kl": -4.263129085302353e-06, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": -0.0, | |
| "reward": 0.6560160380322486, | |
| "reward_std": 0.6600966192781925, | |
| "rewards/cosine_scaled_reward": 0.1196746751666069, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1955.708339691162, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.11805617064237595, | |
| "kl": 4.6776141971349716e-05, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0, | |
| "reward": 0.6841768575832248, | |
| "reward_std": 0.7843492720276117, | |
| "rewards/cosine_scaled_reward": 0.0191717566922307, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2314.3541946411133, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.0782981738448143, | |
| "kl": 2.05114483833313e-05, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0, | |
| "reward": 0.6708630304783583, | |
| "reward_std": 0.7477290779352188, | |
| "rewards/cosine_scaled_reward": 0.054181501967832446, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2753.791702270508, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.0737811028957367, | |
| "kl": 3.1892210245132446e-05, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0, | |
| "reward": 0.6149183064699173, | |
| "reward_std": 0.7014920264482498, | |
| "rewards/cosine_scaled_reward": 0.07829250581562519, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2416.9375228881836, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.0879264622926712, | |
| "kl": 2.4471431970596313e-05, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0, | |
| "reward": 0.3515456095337868, | |
| "reward_std": 0.5202660392969847, | |
| "rewards/cosine_scaled_reward": -0.03256054222583771, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1954.9167098999023, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.10697464644908905, | |
| "kl": 5.552172660827637e-05, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0, | |
| "reward": 0.9258935023099184, | |
| "reward_std": 0.8555895313620567, | |
| "rewards/cosine_scaled_reward": 0.12961340113542974, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 3167.3958587646484, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.0680970847606659, | |
| "kl": 1.3820827007293701e-05, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0, | |
| "reward": -0.24713177233934402, | |
| "reward_std": 0.3474461454898119, | |
| "rewards/cosine_scaled_reward": -0.2173158871009946, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2158.0416717529297, | |
| "epoch": 0.32, | |
| "grad_norm": 0.13484935462474823, | |
| "kl": 3.2179057598114014e-05, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0, | |
| "reward": 0.6351247914135456, | |
| "reward_std": 0.7088255118578672, | |
| "rewards/cosine_scaled_reward": 0.04672905756160617, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 3522.2291870117188, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.061760205775499344, | |
| "kl": 6.163492798805237e-06, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0, | |
| "reward": -0.3107418417930603, | |
| "reward_std": 0.4785036947578192, | |
| "rewards/cosine_scaled_reward": -0.18662092182785273, | |
| "rewards/format_reward": 0.06250000186264515, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2782.562526702881, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.07456035166978836, | |
| "kl": 6.871763616800308e-06, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0, | |
| "reward": 0.6127997636795044, | |
| "reward_std": 0.751378882676363, | |
| "rewards/cosine_scaled_reward": 0.09806656092405319, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2870.8750534057617, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.07354775816202164, | |
| "kl": 8.830800652503967e-06, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0, | |
| "reward": 0.7005033940076828, | |
| "reward_std": 1.0091945696622133, | |
| "rewards/cosine_scaled_reward": 0.1002516932785511, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2088.145854949951, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.13238109648227692, | |
| "kl": 3.805011510848999e-05, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0, | |
| "reward": 0.6708045080304146, | |
| "reward_std": 0.8319668397307396, | |
| "rewards/cosine_scaled_reward": 0.03331891680136323, | |
| "rewards/format_reward": 0.6041666679084301, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 2609.8333892822266, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.06779658049345016, | |
| "kl": 7.536262273788452e-06, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0, | |
| "reward": 0.30458386801183224, | |
| "reward_std": 0.35890259593725204, | |
| "rewards/cosine_scaled_reward": -0.1081247329711914, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2613.000030517578, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.07178284972906113, | |
| "kl": 3.168056719005108e-05, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0, | |
| "reward": 0.44351696502417326, | |
| "reward_std": 0.8193777613341808, | |
| "rewards/cosine_scaled_reward": -0.028241521678864956, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 2048.8541946411133, | |
| "epoch": 0.328, | |
| "grad_norm": 0.11140727996826172, | |
| "kl": 4.149973392486572e-05, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0, | |
| "reward": 0.5404299832880497, | |
| "reward_std": 0.614000715315342, | |
| "rewards/cosine_scaled_reward": -0.011035013943910599, | |
| "rewards/format_reward": 0.5625, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 3102.166717529297, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.05545170232653618, | |
| "kl": 1.2880191206932068e-05, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0, | |
| "reward": 0.12805089727044106, | |
| "reward_std": 0.6392678469419479, | |
| "rewards/cosine_scaled_reward": -0.11305789090692997, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2153.0833473205566, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.10602433234453201, | |
| "kl": 2.4878885596990585e-05, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0, | |
| "reward": 0.5227187471464276, | |
| "reward_std": 0.4022186156362295, | |
| "rewards/cosine_scaled_reward": 0.0009426753968000412, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1898.37504196167, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.10970935970544815, | |
| "kl": 2.4582259356975555e-05, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0, | |
| "reward": 0.7429512850940228, | |
| "reward_std": 0.799828241346404, | |
| "rewards/cosine_scaled_reward": 0.006892322940984741, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 2421.375030517578, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.07605387270450592, | |
| "kl": 9.318813681602478e-06, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0, | |
| "reward": 0.9701324142515659, | |
| "reward_std": 0.9617302333936095, | |
| "rewards/cosine_scaled_reward": 0.20381621085107327, | |
| "rewards/format_reward": 0.562500013038516, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 3169.0208740234375, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.05581577122211456, | |
| "kl": 7.430091500282288e-06, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0, | |
| "reward": -0.2259244667366147, | |
| "reward_std": 0.5163008309900761, | |
| "rewards/cosine_scaled_reward": -0.2587955743074417, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2708.645881652832, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.07145675271749496, | |
| "kl": 2.934178337454796e-05, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0, | |
| "reward": 0.21718984958715737, | |
| "reward_std": 0.6624874398112297, | |
| "rewards/cosine_scaled_reward": -0.14140508696436882, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 3121.1041870117188, | |
| "epoch": 0.336, | |
| "grad_norm": 0.07515346258878708, | |
| "kl": 7.856637239456177e-06, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0, | |
| "reward": 0.021071203984320164, | |
| "reward_std": 0.6561331935226917, | |
| "rewards/cosine_scaled_reward": -0.11446439195424318, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 3187.6666870117188, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.07366888970136642, | |
| "kl": 8.402857929468155e-06, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0, | |
| "reward": 0.044772399589419365, | |
| "reward_std": 0.594879874959588, | |
| "rewards/cosine_scaled_reward": -0.08178047463297844, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 3203.479217529297, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.07104936242103577, | |
| "kl": 3.190338611602783e-05, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0, | |
| "reward": -0.0436970517039299, | |
| "reward_std": 0.8523005917668343, | |
| "rewards/cosine_scaled_reward": -0.16768185701221228, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 3543.7291870117188, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.04666190966963768, | |
| "kl": -3.3173710107803345e-06, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": -0.0, | |
| "reward": -0.18557041604071856, | |
| "reward_std": 0.6471944972872734, | |
| "rewards/cosine_scaled_reward": -0.14486854104325175, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2919.7292098999023, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.059126850217580795, | |
| "kl": 6.070360541343689e-06, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0, | |
| "reward": 0.34876400977373123, | |
| "reward_std": 0.6880132034420967, | |
| "rewards/cosine_scaled_reward": -0.02353466209024191, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 3204.2291870117188, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.05692117661237717, | |
| "kl": 4.030764102935791e-05, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0, | |
| "reward": -0.13842911273241043, | |
| "reward_std": 0.6392941474914551, | |
| "rewards/cosine_scaled_reward": -0.16296455450356007, | |
| "rewards/format_reward": 0.1875, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 3583.4583435058594, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.053756847977638245, | |
| "kl": 8.56444239616394e-06, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0, | |
| "reward": -0.5124701708555222, | |
| "reward_std": 0.3968875277787447, | |
| "rewards/cosine_scaled_reward": -0.27706841565668583, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2341.8542404174805, | |
| "epoch": 0.344, | |
| "grad_norm": 0.09239984303712845, | |
| "kl": 2.6598572731018066e-05, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0, | |
| "reward": 0.3013797178864479, | |
| "reward_std": 0.5681559406220913, | |
| "rewards/cosine_scaled_reward": -0.17222682666033506, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 2407.4375228881836, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.09763790667057037, | |
| "kl": 2.8986483812332153e-05, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0, | |
| "reward": 0.6301762219518423, | |
| "reward_std": 0.7140982802957296, | |
| "rewards/cosine_scaled_reward": 0.0859214163501747, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 2425.7708740234375, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.10430561006069183, | |
| "kl": 3.635883331298828e-05, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0, | |
| "reward": 0.324604582041502, | |
| "reward_std": 0.8040589466691017, | |
| "rewards/cosine_scaled_reward": -0.09811437479220331, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 2782.08341217041, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.07239022850990295, | |
| "kl": 2.8092414140701294e-05, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0, | |
| "reward": 0.37165635358542204, | |
| "reward_std": 0.9195777922868729, | |
| "rewards/cosine_scaled_reward": -0.043338497169315815, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 3050.5416870117188, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.07014129310846329, | |
| "kl": 1.5946105122566223e-05, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0, | |
| "reward": 0.15215962007641792, | |
| "reward_std": 0.43489981070160866, | |
| "rewards/cosine_scaled_reward": -0.08017018809914589, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 2274.541706085205, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.0855133906006813, | |
| "kl": 1.9159168004989624e-05, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0, | |
| "reward": 0.44224592950195074, | |
| "reward_std": 0.6582063380628824, | |
| "rewards/cosine_scaled_reward": -0.04971036873757839, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 2534.5417404174805, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.105194091796875, | |
| "kl": 3.082305192947388e-05, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0, | |
| "reward": 0.41228330321609974, | |
| "reward_std": 0.796582717448473, | |
| "rewards/cosine_scaled_reward": -0.05427503399550915, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 3426.5208435058594, | |
| "epoch": 0.352, | |
| "grad_norm": 0.05296296626329422, | |
| "kl": 4.692934453487396e-06, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0, | |
| "reward": -0.166522528976202, | |
| "reward_std": 0.5611686408519745, | |
| "rewards/cosine_scaled_reward": -0.1770112719386816, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 3382.5208435058594, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.051009587943553925, | |
| "kl": -7.729977369308472e-06, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": -0.0, | |
| "reward": 0.2159974593669176, | |
| "reward_std": 0.5677429530769587, | |
| "rewards/cosine_scaled_reward": -0.027417936362326145, | |
| "rewards/format_reward": 0.2708333358168602, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 2379.166675567627, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.12714841961860657, | |
| "kl": 4.4152140617370605e-05, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0, | |
| "reward": 0.170193150639534, | |
| "reward_std": 0.7661439292132854, | |
| "rewards/cosine_scaled_reward": -0.16490342654287815, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 2530.1250610351562, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.09034020453691483, | |
| "kl": 5.436129868030548e-06, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0, | |
| "reward": 0.6840454712510109, | |
| "reward_std": 0.534562312066555, | |
| "rewards/cosine_scaled_reward": 0.060772710712626576, | |
| "rewards/format_reward": 0.5625, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 2089.416679382324, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.08504354953765869, | |
| "kl": 2.7917325496673584e-05, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0, | |
| "reward": 0.769673366099596, | |
| "reward_std": 0.603489238768816, | |
| "rewards/cosine_scaled_reward": 0.12441998813301325, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 3068.7291870117188, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.06918621063232422, | |
| "kl": -2.9616057872772217e-07, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": -0.0, | |
| "reward": 0.6292300410568714, | |
| "reward_std": 0.8956102449446917, | |
| "rewards/cosine_scaled_reward": 0.09586502579622902, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2325.895851135254, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.0934160128235817, | |
| "kl": 1.2023141607642174e-05, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0, | |
| "reward": 0.515046302229166, | |
| "reward_std": 0.6036827210336924, | |
| "rewards/cosine_scaled_reward": -0.02372686006128788, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2916.812515258789, | |
| "epoch": 0.36, | |
| "grad_norm": 0.07668380439281464, | |
| "kl": 1.3717450201511383e-05, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0, | |
| "reward": 0.19932079315185547, | |
| "reward_std": 0.5037313140928745, | |
| "rewards/cosine_scaled_reward": -0.05658959038555622, | |
| "rewards/format_reward": 0.3125, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 3537.791717529297, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.05294550582766533, | |
| "kl": 2.2545456886291504e-05, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0, | |
| "reward": -0.3032747507095337, | |
| "reward_std": 0.6202223412692547, | |
| "rewards/cosine_scaled_reward": -0.21413737814873457, | |
| "rewards/format_reward": 0.1250000037252903, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 2984.687515258789, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.06460604816675186, | |
| "kl": 2.434849739074707e-05, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0, | |
| "reward": 0.03477427735924721, | |
| "reward_std": 0.3998127859085798, | |
| "rewards/cosine_scaled_reward": -0.11802953109145164, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1796.6250305175781, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.10165087133646011, | |
| "kl": 3.101211041212082e-05, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0, | |
| "reward": 0.6068488396704197, | |
| "reward_std": 0.797613125294447, | |
| "rewards/cosine_scaled_reward": -0.08199225179851055, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 2778.8750228881836, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.10283314436674118, | |
| "kl": 3.739539533853531e-05, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0, | |
| "reward": -0.05662496015429497, | |
| "reward_std": 0.4763899613171816, | |
| "rewards/cosine_scaled_reward": -0.1741458149626851, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1923.8542022705078, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.12830467522144318, | |
| "kl": 5.918368697166443e-05, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0, | |
| "reward": 0.3291757007391425, | |
| "reward_std": 0.6098171658813953, | |
| "rewards/cosine_scaled_reward": -0.15832881536334753, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 2005.020881652832, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.0802675187587738, | |
| "kl": 1.8093734979629517e-05, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0, | |
| "reward": 0.7025870578363538, | |
| "reward_std": 0.7272794898599386, | |
| "rewards/cosine_scaled_reward": 0.007543525251094252, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2775.9166717529297, | |
| "epoch": 0.368, | |
| "grad_norm": 0.09749241173267365, | |
| "kl": 3.974884748458862e-05, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0, | |
| "reward": -0.10498743131756783, | |
| "reward_std": 0.5231676623225212, | |
| "rewards/cosine_scaled_reward": -0.19832705100998282, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2976.4583587646484, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.07950250804424286, | |
| "kl": 7.014721632003784e-06, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0, | |
| "reward": 0.2224793629720807, | |
| "reward_std": 0.6783848963677883, | |
| "rewards/cosine_scaled_reward": -0.07626031711697578, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 2724.3958740234375, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.07554537057876587, | |
| "kl": 1.7061247490346432e-05, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0, | |
| "reward": 0.4769489490427077, | |
| "reward_std": 0.501129625365138, | |
| "rewards/cosine_scaled_reward": -0.011525534908287227, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 2862.937530517578, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.0805167555809021, | |
| "kl": 1.590559259057045e-05, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0, | |
| "reward": 0.6346502639353275, | |
| "reward_std": 0.8927598744630814, | |
| "rewards/cosine_scaled_reward": 0.11940845707431436, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2362.145851135254, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.07862798124551773, | |
| "kl": 1.3891607522964478e-05, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0, | |
| "reward": 0.530715137720108, | |
| "reward_std": 0.3939795885235071, | |
| "rewards/cosine_scaled_reward": 0.004940897226333618, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2695.7916679382324, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.06528741866350174, | |
| "kl": 4.3451786041259766e-05, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0, | |
| "reward": 0.592552687972784, | |
| "reward_std": 0.4058069456368685, | |
| "rewards/cosine_scaled_reward": 0.1087763411924243, | |
| "rewards/format_reward": 0.375, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 3534.1041870117188, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.055547695606946945, | |
| "kl": 1.4275312423706055e-05, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0, | |
| "reward": -0.3857603585347533, | |
| "reward_std": 0.4141149949282408, | |
| "rewards/cosine_scaled_reward": -0.213713513687253, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2279.041679382324, | |
| "epoch": 0.376, | |
| "grad_norm": 0.10334432125091553, | |
| "kl": 3.980100154876709e-05, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0, | |
| "reward": 0.4612154234200716, | |
| "reward_std": 0.8942649606615305, | |
| "rewards/cosine_scaled_reward": -0.019392302725464106, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2104.020866394043, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.13447019457817078, | |
| "kl": 4.4308602809906006e-05, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0, | |
| "reward": 0.2256888933479786, | |
| "reward_std": 0.5865146033465862, | |
| "rewards/cosine_scaled_reward": -0.1892388891428709, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2383.1458435058594, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.12252910435199738, | |
| "kl": 7.56382942199707e-05, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0, | |
| "reward": -0.05577260535210371, | |
| "reward_std": 0.4654993861913681, | |
| "rewards/cosine_scaled_reward": -0.24663630314171314, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2325.8541946411133, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.08140832185745239, | |
| "kl": 1.0751187801361084e-05, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0, | |
| "reward": 0.38908466696739197, | |
| "reward_std": 0.47468874510377645, | |
| "rewards/cosine_scaled_reward": -0.06587433069944382, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 2367.9375534057617, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.0820142850279808, | |
| "kl": 1.646578311920166e-05, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0, | |
| "reward": 0.5586533015593886, | |
| "reward_std": 0.8029082752764225, | |
| "rewards/cosine_scaled_reward": -0.0019233636558055878, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 3402.250030517578, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.054412227123975754, | |
| "kl": -5.424022674560547e-06, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": -0.0, | |
| "reward": -0.1353170946240425, | |
| "reward_std": 0.5321265533566475, | |
| "rewards/cosine_scaled_reward": -0.15099187567830086, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2337.500015258789, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.08019620180130005, | |
| "kl": 1.567695289850235e-05, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0, | |
| "reward": 0.7398268207907677, | |
| "reward_std": 0.5326191149652004, | |
| "rewards/cosine_scaled_reward": 0.05741339363157749, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 2971.791717529297, | |
| "epoch": 0.384, | |
| "grad_norm": 0.08180355280637741, | |
| "kl": 4.472024738788605e-05, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0, | |
| "reward": 0.1902475543320179, | |
| "reward_std": 0.920977134257555, | |
| "rewards/cosine_scaled_reward": -0.09237623494118452, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 3078.2916870117188, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.068634033203125, | |
| "kl": 1.064687967300415e-05, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0, | |
| "reward": -0.09998796135187149, | |
| "reward_std": 0.6867964342236519, | |
| "rewards/cosine_scaled_reward": -0.185410650447011, | |
| "rewards/format_reward": 0.27083333395421505, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 2386.6667098999023, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.1043706014752388, | |
| "kl": 5.0283968448638916e-05, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0, | |
| "reward": 0.5030028827022761, | |
| "reward_std": 0.957062378525734, | |
| "rewards/cosine_scaled_reward": -0.019331891322508454, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 3137.354202270508, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.07295973598957062, | |
| "kl": 1.12876296043396e-05, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0, | |
| "reward": -0.05388517398387194, | |
| "reward_std": 0.44416493922472, | |
| "rewards/cosine_scaled_reward": -0.13110925815999508, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2531.145835876465, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.08190851658582687, | |
| "kl": 1.7684258637018502e-05, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0, | |
| "reward": 0.08293339982628822, | |
| "reward_std": 0.5905051566660404, | |
| "rewards/cosine_scaled_reward": -0.15644997311756015, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2409.229217529297, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.06881759315729141, | |
| "kl": 4.1509512811899185e-05, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0, | |
| "reward": 0.6894203349947929, | |
| "reward_std": 0.6031067483127117, | |
| "rewards/cosine_scaled_reward": 0.0634601479396224, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2857.875015258789, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.09913033992052078, | |
| "kl": 2.505001612007618e-05, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0, | |
| "reward": 0.3237498328089714, | |
| "reward_std": 0.6699114330112934, | |
| "rewards/cosine_scaled_reward": 0.0056248996406793594, | |
| "rewards/format_reward": 0.3125, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 3302.166717529297, | |
| "epoch": 0.392, | |
| "grad_norm": 0.059820495545864105, | |
| "kl": 1.7999671399593353e-05, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0, | |
| "reward": 0.3039425928145647, | |
| "reward_std": 0.9074053410440683, | |
| "rewards/cosine_scaled_reward": -0.025112037546932697, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2413.2291946411133, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.07528231292963028, | |
| "kl": 1.543806865811348e-05, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0, | |
| "reward": 0.9950521737337112, | |
| "reward_std": 0.5791758988052607, | |
| "rewards/cosine_scaled_reward": 0.21627607801929116, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2863.125030517578, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.10458476841449738, | |
| "kl": 5.9917569160461426e-05, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0, | |
| "reward": 0.10197159834206104, | |
| "reward_std": 0.6234806291759014, | |
| "rewards/cosine_scaled_reward": -0.11568087711930275, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 3230.0208740234375, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.053032681345939636, | |
| "kl": 1.961551606655121e-05, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0, | |
| "reward": -0.21714960876852274, | |
| "reward_std": 0.4984629061073065, | |
| "rewards/cosine_scaled_reward": -0.24399147182703018, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 3129.8958587646484, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.06421928852796555, | |
| "kl": 3.246590495109558e-06, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0, | |
| "reward": -0.16860579699277878, | |
| "reward_std": 0.4869399704039097, | |
| "rewards/cosine_scaled_reward": -0.2197195701301098, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2817.7708587646484, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.08860686421394348, | |
| "kl": 1.8882215954363346e-05, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0, | |
| "reward": 0.21238555945456028, | |
| "reward_std": 0.5841794312000275, | |
| "rewards/cosine_scaled_reward": -0.09172389656305313, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 2873.9791870117188, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.07359158992767334, | |
| "kl": 3.639981150627136e-05, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0, | |
| "reward": 0.38269438967108727, | |
| "reward_std": 0.6001935303211212, | |
| "rewards/cosine_scaled_reward": -0.02740282192826271, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 2430.500030517578, | |
| "epoch": 0.4, | |
| "grad_norm": 0.099619559943676, | |
| "kl": 2.70158052444458e-05, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0, | |
| "reward": 0.42640989646315575, | |
| "reward_std": 0.792506992816925, | |
| "rewards/cosine_scaled_reward": -0.03679506108164787, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 3148.7291870117188, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.052269671112298965, | |
| "kl": 1.5914440155029297e-05, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.1340247318148613, | |
| "reward_std": 0.4925071559846401, | |
| "rewards/cosine_scaled_reward": -0.0788209717720747, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2512.5833740234375, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.07939431071281433, | |
| "kl": 3.6345794796943665e-05, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0, | |
| "reward": 0.4275876134634018, | |
| "reward_std": 0.5682895183563232, | |
| "rewards/cosine_scaled_reward": -0.036206201650202274, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 2411.083366394043, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.09583939611911774, | |
| "kl": 3.298744559288025e-05, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0, | |
| "reward": 0.28432985534891486, | |
| "reward_std": 0.5493997083976865, | |
| "rewards/cosine_scaled_reward": -0.08700174884870648, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1941.958351135254, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.09814055263996124, | |
| "kl": 3.0685216188430786e-05, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0, | |
| "reward": 0.8765768958255649, | |
| "reward_std": 0.8131062537431717, | |
| "rewards/cosine_scaled_reward": 0.13620509393513203, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 2359.500030517578, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.09458694607019424, | |
| "kl": 3.7375837564468384e-05, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0, | |
| "reward": 0.7504970212467015, | |
| "reward_std": 0.725981093943119, | |
| "rewards/cosine_scaled_reward": 0.0731651596724987, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2743.6250762939453, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.06355735659599304, | |
| "kl": 2.419203519821167e-05, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0, | |
| "reward": 0.9549091707449406, | |
| "reward_std": 0.7928859256207943, | |
| "rewards/cosine_scaled_reward": 0.15453789941966534, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 3151.0834045410156, | |
| "epoch": 0.408, | |
| "grad_norm": 0.06935708224773407, | |
| "kl": 3.105774521827698e-05, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0, | |
| "reward": -0.028852429240942, | |
| "reward_std": 0.6760261207818985, | |
| "rewards/cosine_scaled_reward": -0.14984288439154625, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2764.770881652832, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.07202122360467911, | |
| "kl": 1.9414350390434265e-05, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0, | |
| "reward": 0.8628654121421278, | |
| "reward_std": 0.7533247694373131, | |
| "rewards/cosine_scaled_reward": 0.13976600766181946, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 2524.0416870117188, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.08405191451311111, | |
| "kl": 4.0803104639053345e-05, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0, | |
| "reward": 0.3434087559580803, | |
| "reward_std": 0.7213883437216282, | |
| "rewards/cosine_scaled_reward": -0.05746229272335768, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 2562.229232788086, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.08999936282634735, | |
| "kl": 4.67449426651001e-05, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0, | |
| "reward": 0.6881978586316109, | |
| "reward_std": 0.7717141099274158, | |
| "rewards/cosine_scaled_reward": 0.08368223905563354, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 2951.7500534057617, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.07436361908912659, | |
| "kl": 2.4508684873580933e-05, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0, | |
| "reward": 0.41937701031565666, | |
| "reward_std": 0.8368613198399544, | |
| "rewards/cosine_scaled_reward": 0.02218849107157439, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1571.4791793823242, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.10805640369653702, | |
| "kl": 4.690699279308319e-05, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0, | |
| "reward": 0.9413224756717682, | |
| "reward_std": 0.5999411288648844, | |
| "rewards/cosine_scaled_reward": 0.07482788991183043, | |
| "rewards/format_reward": 0.7916666734963655, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 2075.2292098999023, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.09721305221319199, | |
| "kl": 3.32072377204895e-05, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0, | |
| "reward": 0.8843853026628494, | |
| "reward_std": 0.7552788248285651, | |
| "rewards/cosine_scaled_reward": 0.1296926699578762, | |
| "rewards/format_reward": 0.625, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 2785.583366394043, | |
| "epoch": 0.416, | |
| "grad_norm": 0.07043363153934479, | |
| "kl": 2.8115231543779373e-05, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0, | |
| "reward": 0.12191003561019897, | |
| "reward_std": 0.598940946161747, | |
| "rewards/cosine_scaled_reward": -0.1682116505689919, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 2823.4375228881836, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.07927356660366058, | |
| "kl": 2.372264862060547e-05, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0, | |
| "reward": 0.03796908678486943, | |
| "reward_std": 0.6242504231631756, | |
| "rewards/cosine_scaled_reward": -0.13726545125246048, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1907.2083568572998, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.09378702938556671, | |
| "kl": 1.1406838893890381e-05, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0, | |
| "reward": 0.9885353557765484, | |
| "reward_std": 0.5849996041506529, | |
| "rewards/cosine_scaled_reward": 0.12968434672802687, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 2806.5208587646484, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.05875913426280022, | |
| "kl": 1.8693506717681885e-05, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0, | |
| "reward": 0.46565248304978013, | |
| "reward_std": 0.8597949855029583, | |
| "rewards/cosine_scaled_reward": -0.006757115945219994, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 2787.354190826416, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.09145579487085342, | |
| "kl": 2.7474015951156616e-05, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0, | |
| "reward": 0.060743533074855804, | |
| "reward_std": 0.48769666999578476, | |
| "rewards/cosine_scaled_reward": -0.11546159163117409, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 2893.812530517578, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.08131539821624756, | |
| "kl": 3.6016106605529785e-05, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0, | |
| "reward": 0.6848491542041302, | |
| "reward_std": 0.9212348274886608, | |
| "rewards/cosine_scaled_reward": 0.11325790174305439, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 3235.8333740234375, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.10968395322561264, | |
| "kl": 1.919269561767578e-05, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0, | |
| "reward": 0.06710641086101532, | |
| "reward_std": 0.4568505547940731, | |
| "rewards/cosine_scaled_reward": -0.06019679829478264, | |
| "rewards/format_reward": 0.18750000186264515, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1835.979190826416, | |
| "epoch": 0.424, | |
| "grad_norm": 0.12385034561157227, | |
| "kl": 2.6310794055461884e-05, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0, | |
| "reward": 0.6333170644938946, | |
| "reward_std": 0.5320510976016521, | |
| "rewards/cosine_scaled_reward": 0.0041585080325603485, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 3005.5208892822266, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.05454113334417343, | |
| "kl": 6.309361197054386e-06, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0, | |
| "reward": 0.6542601608671248, | |
| "reward_std": 0.838243255391717, | |
| "rewards/cosine_scaled_reward": 0.12921340949833393, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 1821.7916984558105, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.11901376396417618, | |
| "kl": 5.982816219329834e-05, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0, | |
| "reward": 0.4493227368220687, | |
| "reward_std": 0.5491539463400841, | |
| "rewards/cosine_scaled_reward": -0.08783863391727209, | |
| "rewards/format_reward": 0.625, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 2537.1667251586914, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.07847942411899567, | |
| "kl": 4.6640634536743164e-05, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0, | |
| "reward": 0.35121756605803967, | |
| "reward_std": 1.0308999605476856, | |
| "rewards/cosine_scaled_reward": -0.0431412230245769, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 3004.8541717529297, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.07147237658500671, | |
| "kl": 2.2170599550008774e-05, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0, | |
| "reward": 0.44833586869481223, | |
| "reward_std": 0.717572771012783, | |
| "rewards/cosine_scaled_reward": 0.026251256465911865, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 2279.1458740234375, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.10169682651758194, | |
| "kl": 5.2988529205322266e-05, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0, | |
| "reward": 0.26139017194509506, | |
| "reward_std": 0.5821227096021175, | |
| "rewards/cosine_scaled_reward": -0.15055493242107332, | |
| "rewards/format_reward": 0.5625000037252903, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 3531.5833740234375, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.059201430529356, | |
| "kl": 4.011020064353943e-05, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0, | |
| "reward": -0.31784557923674583, | |
| "reward_std": 0.6391811221837997, | |
| "rewards/cosine_scaled_reward": -0.20058946311473846, | |
| "rewards/format_reward": 0.08333333395421505, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 2243.0833644866943, | |
| "epoch": 0.432, | |
| "grad_norm": 0.1986025720834732, | |
| "kl": 5.568191409111023e-05, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0, | |
| "reward": 0.8121672458946705, | |
| "reward_std": 0.6707101948559284, | |
| "rewards/cosine_scaled_reward": 0.0935836099088192, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 3184.687511444092, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.08150114864110947, | |
| "kl": 1.6835052520036697e-05, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0, | |
| "reward": -0.33935508131980896, | |
| "reward_std": 0.5709443464875221, | |
| "rewards/cosine_scaled_reward": -0.2530108750797808, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 2518.4583740234375, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.07658781111240387, | |
| "kl": 4.0316954255104065e-05, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0, | |
| "reward": 0.27350061014294624, | |
| "reward_std": 0.6866664737462997, | |
| "rewards/cosine_scaled_reward": -0.10283303062897176, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 3001.708354949951, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.09742611646652222, | |
| "kl": 3.9070844650268555e-05, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0, | |
| "reward": 0.012850694358348846, | |
| "reward_std": 0.676497520878911, | |
| "rewards/cosine_scaled_reward": -0.16024132445454597, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1947.4375381469727, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.09814684092998505, | |
| "kl": 2.527981996536255e-05, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0, | |
| "reward": 0.3552222040016204, | |
| "reward_std": 0.6978839002549648, | |
| "rewards/cosine_scaled_reward": -0.1661389044020325, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 2495.437530517578, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.0902680978178978, | |
| "kl": 5.955994129180908e-05, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0, | |
| "reward": 0.5551093192771077, | |
| "reward_std": 0.3591675292700529, | |
| "rewards/cosine_scaled_reward": 0.02755466243252158, | |
| "rewards/format_reward": 0.5, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 2272.104202270508, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.09107795357704163, | |
| "kl": 2.6211142539978027e-05, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0, | |
| "reward": 1.0653745606541634, | |
| "reward_std": 1.0298114344477654, | |
| "rewards/cosine_scaled_reward": 0.22018728591501713, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 2547.541679382324, | |
| "epoch": 0.44, | |
| "grad_norm": 0.07750263810157776, | |
| "kl": 3.8154772482812405e-05, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0, | |
| "reward": 0.05561675410717726, | |
| "reward_std": 0.5664078928530216, | |
| "rewards/cosine_scaled_reward": -0.2117749648168683, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 2934.6667098999023, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.06273584812879562, | |
| "kl": 1.0585412383079529e-05, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0, | |
| "reward": 0.24991119404876372, | |
| "reward_std": 0.641814861446619, | |
| "rewards/cosine_scaled_reward": -0.08337772451341152, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 2986.125015258789, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.08675210922956467, | |
| "kl": 4.296004772186279e-05, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0, | |
| "reward": -0.17634923849254847, | |
| "reward_std": 0.2983277551829815, | |
| "rewards/cosine_scaled_reward": -0.1923412922769785, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 2441.6875610351562, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.08258826285600662, | |
| "kl": 1.2867152690887451e-05, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0, | |
| "reward": 0.5069841798394918, | |
| "reward_std": 0.6721059624105692, | |
| "rewards/cosine_scaled_reward": -0.0069245845079422, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 2687.687568664551, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.08212108165025711, | |
| "kl": 1.7056241631507874e-05, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0, | |
| "reward": 0.4927669297903776, | |
| "reward_std": 0.6880750022828579, | |
| "rewards/cosine_scaled_reward": -0.014033210929483175, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 2899.5416679382324, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.08406048268079758, | |
| "kl": 1.8338672816753387e-05, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0, | |
| "reward": 0.18273890623822808, | |
| "reward_std": 0.7410868480801582, | |
| "rewards/cosine_scaled_reward": -0.09613056015223265, | |
| "rewards/format_reward": 0.37500000931322575, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 2676.437557220459, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.1075756624341011, | |
| "kl": 3.056228160858154e-05, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0, | |
| "reward": 0.7147546643391252, | |
| "reward_std": 0.9130274280905724, | |
| "rewards/cosine_scaled_reward": 0.10737732611596584, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 2230.1667289733887, | |
| "epoch": 0.448, | |
| "grad_norm": 0.09484854340553284, | |
| "kl": 4.966557025909424e-05, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0, | |
| "reward": 0.47843369096517563, | |
| "reward_std": 0.7248108424246311, | |
| "rewards/cosine_scaled_reward": -0.0732831540517509, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 2507.0000762939453, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.06855693459510803, | |
| "kl": 5.726516246795654e-05, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0, | |
| "reward": 0.3534469548612833, | |
| "reward_std": 0.581486776471138, | |
| "rewards/cosine_scaled_reward": -0.11494319001212716, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 2987.687515258789, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.06926935166120529, | |
| "kl": 2.713967114686966e-05, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0, | |
| "reward": -0.11504888162016869, | |
| "reward_std": 0.568249948322773, | |
| "rewards/cosine_scaled_reward": -0.18252443941310048, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 2165.333335876465, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.1272980123758316, | |
| "kl": 4.277704283595085e-05, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0, | |
| "reward": 0.22512651607394218, | |
| "reward_std": 0.42349702306091785, | |
| "rewards/cosine_scaled_reward": -0.13743674801662564, | |
| "rewards/format_reward": 0.5, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 2721.2500762939453, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.06576870381832123, | |
| "kl": 3.52710485458374e-05, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0, | |
| "reward": 0.2945304214954376, | |
| "reward_std": 0.9794269874691963, | |
| "rewards/cosine_scaled_reward": -0.07148478366434574, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2926.8958740234375, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.06013445183634758, | |
| "kl": 2.625398337841034e-06, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0, | |
| "reward": 0.4305594153702259, | |
| "reward_std": 1.155913457274437, | |
| "rewards/cosine_scaled_reward": -0.045136953704059124, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 2711.6458435058594, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.07058101147413254, | |
| "kl": 2.446770668029785e-05, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0, | |
| "reward": 0.3636575024574995, | |
| "reward_std": 0.9166036918759346, | |
| "rewards/cosine_scaled_reward": -0.06817125831730664, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 2178.4583892822266, | |
| "epoch": 0.456, | |
| "grad_norm": 0.08235453069210052, | |
| "kl": 3.477092832326889e-05, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0, | |
| "reward": 0.6000682711601257, | |
| "reward_std": 0.8982837274670601, | |
| "rewards/cosine_scaled_reward": -0.012465879321098328, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1764.5208587646484, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.1015034094452858, | |
| "kl": 6.362050771713257e-05, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0, | |
| "reward": 0.911169296130538, | |
| "reward_std": 0.8766638562083244, | |
| "rewards/cosine_scaled_reward": 0.09100130602018908, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 3018.5833587646484, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.062130190432071686, | |
| "kl": 4.485994577407837e-05, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0, | |
| "reward": 0.284901387989521, | |
| "reward_std": 0.3474708981812, | |
| "rewards/cosine_scaled_reward": -0.04504931718111038, | |
| "rewards/format_reward": 0.375, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 2477.8125534057617, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.10501542687416077, | |
| "kl": 4.5987311750650406e-05, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0, | |
| "reward": 0.5193133354187012, | |
| "reward_std": 0.4482303988188505, | |
| "rewards/cosine_scaled_reward": -0.0007600053213536739, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1963.25004196167, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.10443693399429321, | |
| "kl": 4.4220127165317535e-05, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0, | |
| "reward": 0.6804529801011086, | |
| "reward_std": 0.5294753350317478, | |
| "rewards/cosine_scaled_reward": -0.024356848560273647, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 2248.8125228881836, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.10749242454767227, | |
| "kl": 3.7647783756256104e-05, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0, | |
| "reward": 0.4887885432690382, | |
| "reward_std": 0.6861027367413044, | |
| "rewards/cosine_scaled_reward": -0.02643907070159912, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 2357.5000343322754, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.10331033170223236, | |
| "kl": 4.6698376536369324e-05, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0, | |
| "reward": 0.7133696104865521, | |
| "reward_std": 0.6605603937059641, | |
| "rewards/cosine_scaled_reward": 0.09626812860369682, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 2476.2291870117188, | |
| "epoch": 0.464, | |
| "grad_norm": 0.07106567174196243, | |
| "kl": 3.6872923374176025e-05, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0, | |
| "reward": 0.5355893382802606, | |
| "reward_std": 1.01215098798275, | |
| "rewards/cosine_scaled_reward": -0.0342886745929718, | |
| "rewards/format_reward": 0.6041666772216558, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2483.458351135254, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.09187468141317368, | |
| "kl": 2.3949891328811646e-05, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0, | |
| "reward": 0.16183810401707888, | |
| "reward_std": 0.5184289030730724, | |
| "rewards/cosine_scaled_reward": -0.12741428334265947, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2792.062557220459, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.07306360453367233, | |
| "kl": 3.31290066242218e-05, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0, | |
| "reward": 0.4886945236939937, | |
| "reward_std": 0.8505336567759514, | |
| "rewards/cosine_scaled_reward": 0.015180593356490135, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 3545.9166870117188, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.051210444420576096, | |
| "kl": -5.498528480529785e-06, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": -0.0, | |
| "reward": -0.2515447251498699, | |
| "reward_std": 0.5099538285285234, | |
| "rewards/cosine_scaled_reward": -0.17785569466650486, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2458.895851135254, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.09167163074016571, | |
| "kl": 3.5781413316726685e-05, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0, | |
| "reward": 0.4632798433303833, | |
| "reward_std": 0.6468245275318623, | |
| "rewards/cosine_scaled_reward": -0.04961008578538895, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 3053.4792098999023, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.07261925935745239, | |
| "kl": -5.988404154777527e-06, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": -0.0, | |
| "reward": -0.015555396676063538, | |
| "reward_std": 0.5620577465742826, | |
| "rewards/cosine_scaled_reward": -0.153611047193408, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2753.208396911621, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.0830722451210022, | |
| "kl": 8.582323789596558e-05, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0, | |
| "reward": 0.35114453732967377, | |
| "reward_std": 0.8256996236741543, | |
| "rewards/cosine_scaled_reward": -0.04317775974050164, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 2613.208366394043, | |
| "epoch": 0.472, | |
| "grad_norm": 0.07420708984136581, | |
| "kl": 1.8894672393798828e-05, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0, | |
| "reward": 0.7368655144236982, | |
| "reward_std": 0.7150338962674141, | |
| "rewards/cosine_scaled_reward": 0.10801609046757221, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2925.916679382324, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.07015778869390488, | |
| "kl": 1.4710240066051483e-05, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0, | |
| "reward": 0.008438636228675023, | |
| "reward_std": 0.705296516418457, | |
| "rewards/cosine_scaled_reward": -0.1520306970924139, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 3225.750015258789, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 0.05689867213368416, | |
| "kl": 7.543712854385376e-05, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0, | |
| "reward": -0.16249761963263154, | |
| "reward_std": 0.5054702050983906, | |
| "rewards/cosine_scaled_reward": -0.1541654784232378, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1772.8125381469727, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.10376486927270889, | |
| "kl": 1.5255063772201538e-05, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0, | |
| "reward": 0.597583282738924, | |
| "reward_std": 0.5255019888281822, | |
| "rewards/cosine_scaled_reward": -0.08662503119558096, | |
| "rewards/format_reward": 0.770833333954215, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 3272.0208435058594, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.07316503673791885, | |
| "kl": 8.390354923903942e-05, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0, | |
| "reward": -0.3171197446063161, | |
| "reward_std": 0.5248132422566414, | |
| "rewards/cosine_scaled_reward": -0.2731432057917118, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 2226.0208473205566, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.1070566475391388, | |
| "kl": 4.1857361793518066e-05, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0, | |
| "reward": 0.5180414142087102, | |
| "reward_std": 0.45487212389707565, | |
| "rewards/cosine_scaled_reward": -0.03264596685767174, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 2562.416679382324, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.08295956999063492, | |
| "kl": 4.1775405406951904e-05, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0, | |
| "reward": 0.24269556999206543, | |
| "reward_std": 0.5556976869702339, | |
| "rewards/cosine_scaled_reward": -0.08698555268347263, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 1584.562557220459, | |
| "epoch": 0.48, | |
| "grad_norm": 0.12170570343732834, | |
| "kl": 6.661564111709595e-05, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0, | |
| "reward": 0.5661385543644428, | |
| "reward_std": 0.7297745365649462, | |
| "rewards/cosine_scaled_reward": -0.11276404978707433, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 3190.9166717529297, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.07585594803094864, | |
| "kl": 2.6751309633255005e-05, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0, | |
| "reward": 0.029675345867872238, | |
| "reward_std": 0.6505759414285421, | |
| "rewards/cosine_scaled_reward": -0.11016233265399933, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2915.562530517578, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.06772468239068985, | |
| "kl": 2.381950616836548e-05, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0, | |
| "reward": 0.13448743242770433, | |
| "reward_std": 0.6184603478759527, | |
| "rewards/cosine_scaled_reward": -0.08900628052651882, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2960.3333435058594, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.06626241654157639, | |
| "kl": 1.325458288192749e-05, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0, | |
| "reward": -0.000947561115026474, | |
| "reward_std": 0.46878720074892044, | |
| "rewards/cosine_scaled_reward": -0.12547378800809383, | |
| "rewards/format_reward": 0.25, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 3068.291717529297, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.09541390836238861, | |
| "kl": 5.204416811466217e-05, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0, | |
| "reward": -0.12997506809188053, | |
| "reward_std": 0.6074860319495201, | |
| "rewards/cosine_scaled_reward": -0.18998754490166903, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 2152.1250076293945, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.07230471819639206, | |
| "kl": 1.7229467630386353e-05, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0, | |
| "reward": 1.4645886905491352, | |
| "reward_std": 0.9418856333941221, | |
| "rewards/cosine_scaled_reward": 0.3364610277931206, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 2156.5000076293945, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.11394708603620529, | |
| "kl": 4.618475213646889e-05, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0, | |
| "reward": 0.5923850163817406, | |
| "reward_std": 0.6649977043271065, | |
| "rewards/cosine_scaled_reward": -0.0058908406645059586, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 3224.875030517578, | |
| "epoch": 0.488, | |
| "grad_norm": 0.06361094117164612, | |
| "kl": 1.4394521713256836e-05, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0, | |
| "reward": 0.03907743562012911, | |
| "reward_std": 0.7518858462572098, | |
| "rewards/cosine_scaled_reward": -0.08462795615196228, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 2353.7709045410156, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.09727543592453003, | |
| "kl": 5.46872615814209e-05, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0, | |
| "reward": 0.2901844955049455, | |
| "reward_std": 0.8400542363524437, | |
| "rewards/cosine_scaled_reward": -0.1361577689240221, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 2253.0833740234375, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.08973829448223114, | |
| "kl": 7.465481758117676e-05, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0, | |
| "reward": 0.4184648059308529, | |
| "reward_std": 0.8010416626930237, | |
| "rewards/cosine_scaled_reward": -0.08243425190448761, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 2549.7083587646484, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.06995910406112671, | |
| "kl": 9.454786777496338e-06, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0, | |
| "reward": 0.38695547729730606, | |
| "reward_std": 0.7342293374240398, | |
| "rewards/cosine_scaled_reward": -0.07735559809952974, | |
| "rewards/format_reward": 0.5416666697710752, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 2568.2916946411133, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.0774778425693512, | |
| "kl": 6.29723072052002e-05, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0, | |
| "reward": 0.25465894117951393, | |
| "reward_std": 0.6233883164823055, | |
| "rewards/cosine_scaled_reward": -0.09142053499817848, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 2978.562515258789, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.0629786029458046, | |
| "kl": 4.05237078666687e-05, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0, | |
| "reward": 0.1960265375673771, | |
| "reward_std": 0.5580219030380249, | |
| "rewards/cosine_scaled_reward": -0.05823674378916621, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 2684.458366394043, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.06860741972923279, | |
| "kl": 2.547353506088257e-05, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0, | |
| "reward": 0.6179636809974909, | |
| "reward_std": 0.7890575006604195, | |
| "rewards/cosine_scaled_reward": 0.07981517910957336, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2930.3541870117188, | |
| "epoch": 0.496, | |
| "grad_norm": 0.10294987261295319, | |
| "kl": 8.677318692207336e-05, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0, | |
| "reward": -0.20126579143106937, | |
| "reward_std": 0.3846661662682891, | |
| "rewards/cosine_scaled_reward": -0.2568828947842121, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 2329.312568664551, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.10973911732435226, | |
| "kl": 8.010119199752808e-05, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0, | |
| "reward": 0.39892143197357655, | |
| "reward_std": 0.5416516847908497, | |
| "rewards/cosine_scaled_reward": -0.07137262634932995, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 2192.229190826416, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.10160942375659943, | |
| "kl": 4.024803638458252e-05, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0, | |
| "reward": 0.6950225085020065, | |
| "reward_std": 0.786985732614994, | |
| "rewards/cosine_scaled_reward": 0.055844588205218315, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2709.791702270508, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.06406563520431519, | |
| "kl": 1.9855797290802002e-05, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0, | |
| "reward": -0.04634993802756071, | |
| "reward_std": 0.5686388742178679, | |
| "rewards/cosine_scaled_reward": -0.2523416392505169, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2973.583335876465, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.06718044728040695, | |
| "kl": 1.529604196548462e-05, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0, | |
| "reward": -0.10987140890210867, | |
| "reward_std": 0.647129736840725, | |
| "rewards/cosine_scaled_reward": -0.19035237049683928, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2368.270866394043, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.07579582184553146, | |
| "kl": 5.852663889527321e-05, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0, | |
| "reward": 0.13639006949961185, | |
| "reward_std": 0.634217020124197, | |
| "rewards/cosine_scaled_reward": -0.18180496850982308, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 2957.3541717529297, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.07545099407434464, | |
| "kl": 3.7181656807661057e-05, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0, | |
| "reward": -0.3488370534032583, | |
| "reward_std": 0.3910331390798092, | |
| "rewards/cosine_scaled_reward": -0.2785852048546076, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2997.2916870117188, | |
| "epoch": 0.504, | |
| "grad_norm": 0.07434091717004776, | |
| "kl": 5.7131052017211914e-05, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0, | |
| "reward": 0.1670287363231182, | |
| "reward_std": 0.7089552786201239, | |
| "rewards/cosine_scaled_reward": -0.062318971613422036, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2926.1666870117188, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.07139282673597336, | |
| "kl": 9.782612323760986e-06, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0, | |
| "reward": 0.10981103032827377, | |
| "reward_std": 0.6779932491481304, | |
| "rewards/cosine_scaled_reward": -0.10134449601173401, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 3128.0208435058594, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.06477571278810501, | |
| "kl": -5.7220458984375e-06, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": -0.0, | |
| "reward": 0.27442847611382604, | |
| "reward_std": 0.7668914757668972, | |
| "rewards/cosine_scaled_reward": -0.029452435206621885, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 2890.0625076293945, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.07404446601867676, | |
| "kl": 1.334305852651596e-05, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0, | |
| "reward": 0.0024630650877952576, | |
| "reward_std": 0.5683269854635, | |
| "rewards/cosine_scaled_reward": -0.14460179908201098, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 3076.1875228881836, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.06191933900117874, | |
| "kl": 3.581121563911438e-05, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0, | |
| "reward": 0.31515196431428194, | |
| "reward_std": 0.8616945073008537, | |
| "rewards/cosine_scaled_reward": -0.009090693667531013, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2881.666702270508, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.06932635605335236, | |
| "kl": 3.975396975874901e-05, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0, | |
| "reward": 0.10830593202263117, | |
| "reward_std": 0.6627788320183754, | |
| "rewards/cosine_scaled_reward": -0.12293036445043981, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 2267.000045776367, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.08983408659696579, | |
| "kl": 4.3801963329315186e-05, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0, | |
| "reward": 0.5674593984149396, | |
| "reward_std": 0.7652163729071617, | |
| "rewards/cosine_scaled_reward": -0.03918697941116989, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 2121.4583435058594, | |
| "epoch": 0.512, | |
| "grad_norm": 0.11666633933782578, | |
| "kl": 0.00010218843817710876, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0, | |
| "reward": 0.41809410601854324, | |
| "reward_std": 0.3777278680354357, | |
| "rewards/cosine_scaled_reward": -0.05136961303651333, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 2729.708351135254, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.07791003584861755, | |
| "kl": 3.200024366378784e-05, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0, | |
| "reward": 0.1269844751805067, | |
| "reward_std": 0.5874846428632736, | |
| "rewards/cosine_scaled_reward": -0.12400775775313377, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2512.770835876465, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.07424568384885788, | |
| "kl": 4.151836037635803e-05, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0, | |
| "reward": 0.144259762018919, | |
| "reward_std": 0.5727597586810589, | |
| "rewards/cosine_scaled_reward": -0.12578679248690605, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 2684.7083435058594, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 0.10009966790676117, | |
| "kl": 3.201141953468323e-05, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0, | |
| "reward": 0.34645023569464684, | |
| "reward_std": 0.4051150921732187, | |
| "rewards/cosine_scaled_reward": -0.003858224954456091, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 3349.1875610351562, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 0.05349632352590561, | |
| "kl": -4.492700099945068e-06, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": -0.0, | |
| "reward": 0.3489017002284527, | |
| "reward_std": 0.8299882672727108, | |
| "rewards/cosine_scaled_reward": 0.028617504984140396, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 2467.5833587646484, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.11451800912618637, | |
| "kl": 6.0811013099737465e-05, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0, | |
| "reward": 0.23246023803949356, | |
| "reward_std": 0.5281177684664726, | |
| "rewards/cosine_scaled_reward": -0.15460322797298431, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 2652.9375534057617, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.06591752916574478, | |
| "kl": 2.1766871213912964e-05, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0, | |
| "reward": 0.1991555169224739, | |
| "reward_std": 0.7391630746424198, | |
| "rewards/cosine_scaled_reward": -0.15042224945500493, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 3100.270835876465, | |
| "epoch": 0.52, | |
| "grad_norm": 0.08253812789916992, | |
| "kl": 2.0118430256843567e-05, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0, | |
| "reward": -0.32044728845357895, | |
| "reward_std": 0.463680999353528, | |
| "rewards/cosine_scaled_reward": -0.26439031958580017, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 3148.3541870117188, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.06728158891201019, | |
| "kl": 8.217990398406982e-06, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0, | |
| "reward": -0.17758686933666468, | |
| "reward_std": 0.5700683370232582, | |
| "rewards/cosine_scaled_reward": -0.20337677374482155, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 3049.250045776367, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.07662766426801682, | |
| "kl": 4.097074270248413e-05, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0, | |
| "reward": 0.14775238651782274, | |
| "reward_std": 0.8221618942916393, | |
| "rewards/cosine_scaled_reward": -0.05112380969512742, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2331.708339691162, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 0.10992821305990219, | |
| "kl": 3.629177808761597e-05, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0, | |
| "reward": 0.13964971527457237, | |
| "reward_std": 0.3678394239395857, | |
| "rewards/cosine_scaled_reward": -0.18017515260726213, | |
| "rewards/format_reward": 0.5, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1773.1041946411133, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.11801594495773315, | |
| "kl": 4.814937710762024e-05, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0, | |
| "reward": 1.0043286234140396, | |
| "reward_std": 0.9032549224793911, | |
| "rewards/cosine_scaled_reward": 0.12716429959982634, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 3080.8958892822266, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.06654898822307587, | |
| "kl": 3.0407682061195374e-06, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": -0.0, | |
| "reward": 0.06573297083377838, | |
| "reward_std": 0.6864577080123127, | |
| "rewards/cosine_scaled_reward": -0.1546335220336914, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2982.062515258789, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 0.06870046257972717, | |
| "kl": 3.9667822420597076e-05, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0, | |
| "reward": -0.030075288377702236, | |
| "reward_std": 0.7001252397894859, | |
| "rewards/cosine_scaled_reward": -0.15045431395992637, | |
| "rewards/format_reward": 0.2708333395421505, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 2850.2916717529297, | |
| "epoch": 0.528, | |
| "grad_norm": 0.07281744480133057, | |
| "kl": 2.4617649614810944e-05, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0, | |
| "reward": -0.24683623388409615, | |
| "reward_std": 0.3418162725865841, | |
| "rewards/cosine_scaled_reward": -0.24841812415979803, | |
| "rewards/format_reward": 0.25, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2778.166702270508, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.057184841483831406, | |
| "kl": 2.6188790798187256e-05, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0, | |
| "reward": 0.23532123491168022, | |
| "reward_std": 1.001348927617073, | |
| "rewards/cosine_scaled_reward": -0.09067271713865921, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1686.6250228881836, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.10833612084388733, | |
| "kl": 4.209578037261963e-05, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0, | |
| "reward": 0.946430669631809, | |
| "reward_std": 0.3977198153734207, | |
| "rewards/cosine_scaled_reward": 0.1398820113390684, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 2949.5833435058594, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.08343047648668289, | |
| "kl": 2.3955944925546646e-05, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0, | |
| "reward": 0.15869783610105515, | |
| "reward_std": 0.9265065267682076, | |
| "rewards/cosine_scaled_reward": -0.08731775241903961, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 2935.5625076293945, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.06667914241552353, | |
| "kl": 2.3618340492248535e-05, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0, | |
| "reward": 0.1596589144319296, | |
| "reward_std": 0.359066603705287, | |
| "rewards/cosine_scaled_reward": -0.03475389443337917, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 3074.854179382324, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.08065426349639893, | |
| "kl": 5.206838250160217e-05, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0, | |
| "reward": -0.2103513814508915, | |
| "reward_std": 0.45726192370057106, | |
| "rewards/cosine_scaled_reward": -0.20934236235916615, | |
| "rewards/format_reward": 0.20833333395421505, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2888.541679382324, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.082003153860569, | |
| "kl": 4.050601273775101e-05, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0, | |
| "reward": -0.20459073898382485, | |
| "reward_std": 0.3391787763684988, | |
| "rewards/cosine_scaled_reward": -0.2585453763604164, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2827.5416679382324, | |
| "epoch": 0.536, | |
| "grad_norm": 0.0835421085357666, | |
| "kl": 1.248624175786972e-05, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0, | |
| "reward": -0.0638388879597187, | |
| "reward_std": 0.38233031099662185, | |
| "rewards/cosine_scaled_reward": -0.17775276489555836, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2983.7291870117188, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 0.07171688973903656, | |
| "kl": 4.877336323261261e-05, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0, | |
| "reward": -0.05503089912235737, | |
| "reward_std": 0.6313158124685287, | |
| "rewards/cosine_scaled_reward": -0.17334878351539373, | |
| "rewards/format_reward": 0.2916666753590107, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 3055.1666870117188, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.058664120733737946, | |
| "kl": 1.4587771147489548e-05, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0, | |
| "reward": -0.007740400731563568, | |
| "reward_std": 0.41847239434719086, | |
| "rewards/cosine_scaled_reward": -0.12887020222842693, | |
| "rewards/format_reward": 0.25, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2884.937530517578, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.06170263886451721, | |
| "kl": 7.81528651714325e-05, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0, | |
| "reward": 0.11590329185128212, | |
| "reward_std": 0.743870422244072, | |
| "rewards/cosine_scaled_reward": -0.11913168523460627, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 3235.500030517578, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.06060759350657463, | |
| "kl": 3.5218894481658936e-05, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0, | |
| "reward": -0.03435097075998783, | |
| "reward_std": 0.7473096624016762, | |
| "rewards/cosine_scaled_reward": -0.13175882119685411, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2471.770866394043, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.08800923079252243, | |
| "kl": 2.9017683118581772e-05, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0, | |
| "reward": 0.9437330272048712, | |
| "reward_std": 0.7430768720805645, | |
| "rewards/cosine_scaled_reward": 0.211449827067554, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 2501.2500228881836, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.08744020760059357, | |
| "kl": 4.112720489501953e-05, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0, | |
| "reward": 0.7349114557728171, | |
| "reward_std": 0.9988465346395969, | |
| "rewards/cosine_scaled_reward": 0.07578905718401074, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 2667.125045776367, | |
| "epoch": 0.544, | |
| "grad_norm": 0.08761317282915115, | |
| "kl": 4.673376679420471e-05, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0, | |
| "reward": 0.535813775844872, | |
| "reward_std": 0.9447889849543571, | |
| "rewards/cosine_scaled_reward": 0.007490205112844706, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 2051.145881652832, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.10010962188243866, | |
| "kl": 3.673136234283447e-05, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0, | |
| "reward": 1.1608257871121168, | |
| "reward_std": 0.8816641084849834, | |
| "rewards/cosine_scaled_reward": 0.20541287446394563, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 3102.8125, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.07919441163539886, | |
| "kl": 1.853145658969879e-05, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0, | |
| "reward": 0.26362159801647067, | |
| "reward_std": 0.7521736957132816, | |
| "rewards/cosine_scaled_reward": -0.066105873323977, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2834.7708435058594, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.08955971151590347, | |
| "kl": 3.330502659082413e-05, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0, | |
| "reward": 0.02914455719292164, | |
| "reward_std": 0.6620577126741409, | |
| "rewards/cosine_scaled_reward": -0.16251106595154852, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2238.520851135254, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.10024753957986832, | |
| "kl": 6.082782056182623e-05, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0, | |
| "reward": 0.4133927784860134, | |
| "reward_std": 0.7841507941484451, | |
| "rewards/cosine_scaled_reward": -0.07455361541360617, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 3055.3750762939453, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.06506208330392838, | |
| "kl": 2.900976687669754e-05, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0, | |
| "reward": 0.014262760989367962, | |
| "reward_std": 0.6414206568151712, | |
| "rewards/cosine_scaled_reward": -0.18036862555891275, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2815.666679382324, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.08013670146465302, | |
| "kl": 4.282337613403797e-05, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0, | |
| "reward": 0.4762246310710907, | |
| "reward_std": 0.9046560376882553, | |
| "rewards/cosine_scaled_reward": 0.07144563179463148, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2774.1042098999023, | |
| "epoch": 0.552, | |
| "grad_norm": 0.06167498975992203, | |
| "kl": 2.088397741317749e-05, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0, | |
| "reward": 0.2984310518950224, | |
| "reward_std": 0.46697283908724785, | |
| "rewards/cosine_scaled_reward": -0.11120116710662842, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2558.270854949951, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 0.07994337379932404, | |
| "kl": 5.245208740234375e-06, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0, | |
| "reward": 0.33059572614729404, | |
| "reward_std": 0.8539671897888184, | |
| "rewards/cosine_scaled_reward": -0.053452130407094955, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 2086.5208740234375, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.10933925211429596, | |
| "kl": 8.203089237213135e-05, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0, | |
| "reward": 0.28971124812960625, | |
| "reward_std": 0.630750261247158, | |
| "rewards/cosine_scaled_reward": -0.18847772106528282, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 1841.1875457763672, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.10996180027723312, | |
| "kl": 6.404519081115723e-05, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0, | |
| "reward": 0.5313799711875618, | |
| "reward_std": 0.5339483916759491, | |
| "rewards/cosine_scaled_reward": -0.05722668580710888, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1869.8750267028809, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.1162729561328888, | |
| "kl": 1.4529097825288773e-05, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0, | |
| "reward": 0.9877903237938881, | |
| "reward_std": 0.7925257999449968, | |
| "rewards/cosine_scaled_reward": 0.16056180885061622, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 2408.1041984558105, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.08437583595514297, | |
| "kl": 2.6656314730644226e-05, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0, | |
| "reward": 0.24246202781796455, | |
| "reward_std": 0.550208680331707, | |
| "rewards/cosine_scaled_reward": -0.11835231864824891, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 3094.750011444092, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.09759029746055603, | |
| "kl": 6.0001155361533165e-05, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0, | |
| "reward": -0.3092441540211439, | |
| "reward_std": 0.4272337146103382, | |
| "rewards/cosine_scaled_reward": -0.24837207747623324, | |
| "rewards/format_reward": 0.1875, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 2849.7916870117188, | |
| "epoch": 0.56, | |
| "grad_norm": 0.0604851134121418, | |
| "kl": 1.6011297702789307e-05, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0, | |
| "reward": 0.4163107567001134, | |
| "reward_std": 0.8956920951604843, | |
| "rewards/cosine_scaled_reward": -0.02101128175854683, | |
| "rewards/format_reward": 0.45833333767950535, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2636.541702270508, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.07633573561906815, | |
| "kl": 3.331899642944336e-05, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0, | |
| "reward": 0.5356120392680168, | |
| "reward_std": 0.688828831538558, | |
| "rewards/cosine_scaled_reward": 0.028222685679793358, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2699.645835876465, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.09107892215251923, | |
| "kl": 5.65163791179657e-05, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0, | |
| "reward": -0.10418308898806572, | |
| "reward_std": 0.4072574134916067, | |
| "rewards/cosine_scaled_reward": -0.19792487937957048, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2190.0000534057617, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.08895355463027954, | |
| "kl": 2.7106492780148983e-05, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0, | |
| "reward": 0.7456803433597088, | |
| "reward_std": 0.5171892158687115, | |
| "rewards/cosine_scaled_reward": 0.08117351215332747, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 2136.5625381469727, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.09751053899526596, | |
| "kl": 3.5608187317848206e-05, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0, | |
| "reward": 0.7730939202010632, | |
| "reward_std": 1.0281653888523579, | |
| "rewards/cosine_scaled_reward": 0.03238028334453702, | |
| "rewards/format_reward": 0.7083333544433117, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 3093.2291870117188, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.058134015649557114, | |
| "kl": 6.802193820476532e-05, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0, | |
| "reward": 0.14028028398752213, | |
| "reward_std": 0.824892794713378, | |
| "rewards/cosine_scaled_reward": -0.09652652451768517, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1778.2500343322754, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.11538412421941757, | |
| "kl": 6.56619668006897e-05, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0, | |
| "reward": 0.9300711955875158, | |
| "reward_std": 0.6772461663931608, | |
| "rewards/cosine_scaled_reward": 0.11086891777813435, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 2530.6042137145996, | |
| "epoch": 0.568, | |
| "grad_norm": 0.08302483707666397, | |
| "kl": 4.589557647705078e-05, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0, | |
| "reward": 0.9374174466356635, | |
| "reward_std": 0.7146439161151648, | |
| "rewards/cosine_scaled_reward": 0.22912537679076195, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2849.1458587646484, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.06491124629974365, | |
| "kl": 3.0152499675750732e-05, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0, | |
| "reward": 0.1876700147986412, | |
| "reward_std": 0.6750749666243792, | |
| "rewards/cosine_scaled_reward": -0.1249149963259697, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 2447.2916870117188, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.07432974874973297, | |
| "kl": 3.886595368385315e-05, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0, | |
| "reward": 0.5329095907509327, | |
| "reward_std": 1.062804576009512, | |
| "rewards/cosine_scaled_reward": -0.025211881089489907, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 2930.9791870117188, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.07764609158039093, | |
| "kl": 4.096329212188721e-05, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0, | |
| "reward": 0.04181864787824452, | |
| "reward_std": 0.7614006511867046, | |
| "rewards/cosine_scaled_reward": -0.12492401897907257, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 1.4284696741739821e-06, | |
| "train_runtime": 164217.1359, | |
| "train_samples_per_second": 0.146, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |