| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.22857142857142856, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2523.270866394043, |
| "epoch": 0.001142857142857143, |
| "grad_norm": 0.0744374468922615, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": 0.0007, |
| "reward": 0.17862090840935707, |
| "reward_std": 0.539480353705585, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": -0.0713790925219655, |
| "step": 1 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2684.583366394043, |
| "epoch": 0.002285714285714286, |
| "grad_norm": 0.07803362607955933, |
| "kl": 0.0, |
| "learning_rate": 1e-07, |
| "loss": 0.0338, |
| "reward": 0.33918463438749313, |
| "reward_std": 0.4111455399543047, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": 0.047517990693449974, |
| "step": 2 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2981.3541717529297, |
| "epoch": 0.0034285714285714284, |
| "grad_norm": 0.07460929453372955, |
| "kl": 5.2601099014282227e-05, |
| "learning_rate": 2e-07, |
| "loss": -0.0527, |
| "reward": 0.0897666085511446, |
| "reward_std": 0.43747894931584597, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.014400066807866096, |
| "step": 3 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1419.8333740234375, |
| "epoch": 0.004571428571428572, |
| "grad_norm": 0.11164188385009766, |
| "kl": 4.7594308853149414e-05, |
| "learning_rate": 3e-07, |
| "loss": -0.0508, |
| "reward": 0.09919259510934353, |
| "reward_std": 0.6438650283962488, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.06747407512739301, |
| "step": 4 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3174.0625228881836, |
| "epoch": 0.005714285714285714, |
| "grad_norm": 0.06975400447845459, |
| "kl": 5.8650970458984375e-05, |
| "learning_rate": 4e-07, |
| "loss": 0.0271, |
| "reward": -0.15678282314911485, |
| "reward_std": 0.33761502243578434, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.19844949309481308, |
| "step": 5 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2732.250045776367, |
| "epoch": 0.006857142857142857, |
| "grad_norm": 0.07184051722288132, |
| "kl": 4.495866596698761e-05, |
| "learning_rate": 5e-07, |
| "loss": -0.0141, |
| "reward": -0.13207554817199707, |
| "reward_std": 0.2536952579393983, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.1737422114238143, |
| "step": 6 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2609.5208740234375, |
| "epoch": 0.008, |
| "grad_norm": 0.06629231572151184, |
| "kl": 3.341585397720337e-05, |
| "learning_rate": 6e-07, |
| "loss": 0.0435, |
| "reward": 0.09007547050714493, |
| "reward_std": 0.3771515293046832, |
| "rewards/accuracy_reward": 0.16666667349636555, |
| "rewards/cosine_scaled_reward": -0.07659117877483368, |
| "step": 7 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2417.1458892822266, |
| "epoch": 0.009142857142857144, |
| "grad_norm": 0.07080549746751785, |
| "kl": 3.287196159362793e-05, |
| "learning_rate": 7e-07, |
| "loss": -0.0181, |
| "reward": 0.4757417570799589, |
| "reward_std": 0.6338865607976913, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.12157511284749489, |
| "step": 8 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2861.041702270508, |
| "epoch": 0.010285714285714285, |
| "grad_norm": 0.1069210022687912, |
| "kl": 4.7460198402404785e-05, |
| "learning_rate": 8e-07, |
| "loss": 0.0098, |
| "reward": -0.021934514865279198, |
| "reward_std": 0.4319152287207544, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.1261011796505045, |
| "step": 9 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2603.104179382324, |
| "epoch": 0.011428571428571429, |
| "grad_norm": 0.08398132026195526, |
| "kl": 5.1975250244140625e-05, |
| "learning_rate": 9e-07, |
| "loss": -0.0231, |
| "reward": 0.13638958521187305, |
| "reward_std": 0.5910755675286055, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.051110414788126945, |
| "step": 10 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3321.750030517578, |
| "epoch": 0.012571428571428572, |
| "grad_norm": 0.04589095339179039, |
| "kl": 4.659593105316162e-05, |
| "learning_rate": 1e-06, |
| "loss": 0.0195, |
| "reward": -0.061404408887028694, |
| "reward_std": 0.3960698740556836, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.16557107982225716, |
| "step": 11 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2037.8750457763672, |
| "epoch": 0.013714285714285714, |
| "grad_norm": 0.0902877002954483, |
| "kl": 4.0978193283081055e-05, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0965, |
| "reward": 0.18073058780282736, |
| "reward_std": 0.5450206631794572, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.006769413128495216, |
| "step": 12 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2972.312515258789, |
| "epoch": 0.014857142857142857, |
| "grad_norm": 0.05695594474673271, |
| "kl": 3.445148468017578e-05, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0496, |
| "reward": 0.18564651999622583, |
| "reward_std": 0.5214524045586586, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.04352014325559139, |
| "step": 13 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2374.9375228881836, |
| "epoch": 0.016, |
| "grad_norm": 0.0727272629737854, |
| "kl": 3.678351640701294e-05, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0125, |
| "reward": 0.08268034365028143, |
| "reward_std": 0.5252711391076446, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.08398633264005184, |
| "step": 14 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2724.062515258789, |
| "epoch": 0.017142857142857144, |
| "grad_norm": 0.07843177765607834, |
| "kl": 3.783963620662689e-05, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": -0.0409, |
| "reward": 0.2568634729832411, |
| "reward_std": 0.3660598713904619, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.006863469257950783, |
| "step": 15 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3526.7083435058594, |
| "epoch": 0.018285714285714287, |
| "grad_norm": 0.05552014708518982, |
| "kl": 5.0380825996398926e-05, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": -0.0203, |
| "reward": -0.260313069447875, |
| "reward_std": 0.15844993200153112, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.26031307131052017, |
| "step": 16 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2162.083366394043, |
| "epoch": 0.019428571428571427, |
| "grad_norm": 0.130440354347229, |
| "kl": 5.4463744163513184e-05, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0201, |
| "reward": 0.16820077877491713, |
| "reward_std": 0.6183132668957114, |
| "rewards/accuracy_reward": 0.2291666753590107, |
| "rewards/cosine_scaled_reward": -0.0609658882021904, |
| "step": 17 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2849.8750534057617, |
| "epoch": 0.02057142857142857, |
| "grad_norm": 0.05553295090794563, |
| "kl": 3.0259601771831512e-05, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0625, |
| "reward": 0.28691262751817703, |
| "reward_std": 0.5530234389007092, |
| "rewards/accuracy_reward": 0.2708333358168602, |
| "rewards/cosine_scaled_reward": 0.0160792883252725, |
| "step": 18 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2852.1250534057617, |
| "epoch": 0.021714285714285714, |
| "grad_norm": 0.069038525223732, |
| "kl": 3.6716461181640625e-05, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0466, |
| "reward": 0.41487319313455373, |
| "reward_std": 0.5478954035788774, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.060706520453095436, |
| "step": 19 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1917.3125457763672, |
| "epoch": 0.022857142857142857, |
| "grad_norm": 0.09311036020517349, |
| "kl": 3.442913293838501e-05, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0638, |
| "reward": 0.34980504028499126, |
| "reward_std": 0.5843578286003321, |
| "rewards/accuracy_reward": 0.31250000186264515, |
| "rewards/cosine_scaled_reward": 0.037305014207959175, |
| "step": 20 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2692.729232788086, |
| "epoch": 0.024, |
| "grad_norm": 0.10610220581293106, |
| "kl": 4.506111145019531e-05, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0482, |
| "reward": 0.3050496280193329, |
| "reward_std": 0.5608191061764956, |
| "rewards/accuracy_reward": 0.2916666753590107, |
| "rewards/cosine_scaled_reward": 0.013382963836193085, |
| "step": 21 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1465.395866394043, |
| "epoch": 0.025142857142857144, |
| "grad_norm": 0.12254074215888977, |
| "kl": 3.9443373680114746e-05, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": -0.0519, |
| "reward": 0.3392514977604151, |
| "reward_std": 0.30786877777427435, |
| "rewards/accuracy_reward": 0.3333333432674408, |
| "rewards/cosine_scaled_reward": 0.0059181563556194305, |
| "step": 22 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2394.937530517578, |
| "epoch": 0.026285714285714287, |
| "grad_norm": 0.10998130589723587, |
| "kl": 3.953278064727783e-05, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0996, |
| "reward": 0.14149342849850655, |
| "reward_std": 0.34754151944071054, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.0668399203568697, |
| "step": 23 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2533.875045776367, |
| "epoch": 0.027428571428571427, |
| "grad_norm": 0.08649002015590668, |
| "kl": 3.2901763916015625e-05, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0447, |
| "reward": 0.5146787296980619, |
| "reward_std": 0.5441905837506056, |
| "rewards/accuracy_reward": 0.39583334140479565, |
| "rewards/cosine_scaled_reward": 0.118845384567976, |
| "step": 24 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2268.791679382324, |
| "epoch": 0.02857142857142857, |
| "grad_norm": 0.08303514868021011, |
| "kl": 4.312890814617276e-05, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0186, |
| "reward": -0.08262800239026546, |
| "reward_std": 0.36995193734765053, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.1659613372758031, |
| "step": 25 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2869.7500228881836, |
| "epoch": 0.029714285714285714, |
| "grad_norm": 0.061481326818466187, |
| "kl": 5.170702934265137e-05, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": -0.0101, |
| "reward": 0.006060175597667694, |
| "reward_std": 0.3566547529771924, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.11893982626497746, |
| "step": 26 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2821.687530517578, |
| "epoch": 0.030857142857142857, |
| "grad_norm": 0.08310137689113617, |
| "kl": 6.3285231590271e-05, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": -0.004, |
| "reward": -0.13682544324547052, |
| "reward_std": 0.38389212638139725, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.19932544289622456, |
| "step": 27 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2828.333366394043, |
| "epoch": 0.032, |
| "grad_norm": 0.07233916968107224, |
| "kl": 4.972517490386963e-05, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0209, |
| "reward": 0.25604306906461716, |
| "reward_std": 0.5071917362511158, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": -0.014790281420573592, |
| "step": 28 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3092.6458587646484, |
| "epoch": 0.03314285714285714, |
| "grad_norm": 0.07088705897331238, |
| "kl": 4.079937934875488e-05, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": -0.0988, |
| "reward": -0.12730359099805355, |
| "reward_std": 0.25404997263103724, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.18980359099805355, |
| "step": 29 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2785.1458740234375, |
| "epoch": 0.03428571428571429, |
| "grad_norm": 0.0721890851855278, |
| "kl": 4.616379737854004e-05, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0538, |
| "reward": 0.26715128123760223, |
| "reward_std": 0.5637835282832384, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": 0.0379846110008657, |
| "step": 30 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3028.0416679382324, |
| "epoch": 0.03542857142857143, |
| "grad_norm": 0.07398995757102966, |
| "kl": 4.813075065612793e-05, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0307, |
| "reward": -0.1556811612099409, |
| "reward_std": 0.3613749761134386, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.21818116027861834, |
| "step": 31 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2944.3958740234375, |
| "epoch": 0.036571428571428574, |
| "grad_norm": 0.06509231775999069, |
| "kl": 5.62518835067749e-05, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0698, |
| "reward": 0.305016117868945, |
| "reward_std": 0.3333305884152651, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.013349458575248718, |
| "step": 32 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3250.5208587646484, |
| "epoch": 0.037714285714285714, |
| "grad_norm": 0.061484675854444504, |
| "kl": 5.4955482482910156e-05, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.025, |
| "reward": 0.008718075230717659, |
| "reward_std": 0.5328585915267467, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.1371152652427554, |
| "step": 33 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2223.6666870117188, |
| "epoch": 0.038857142857142854, |
| "grad_norm": 0.08283665031194687, |
| "kl": 4.7266483306884766e-05, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0628, |
| "reward": 0.5019577480852604, |
| "reward_std": 0.6730294618755579, |
| "rewards/accuracy_reward": 0.3750000111758709, |
| "rewards/cosine_scaled_reward": 0.1269577438943088, |
| "step": 34 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3116.520896911621, |
| "epoch": 0.04, |
| "grad_norm": 0.10011623799800873, |
| "kl": 6.172060966491699e-05, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": -0.0185, |
| "reward": -0.1340523180551827, |
| "reward_std": 0.38020466081798077, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.1965523138642311, |
| "step": 35 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3130.75, |
| "epoch": 0.04114285714285714, |
| "grad_norm": 0.06970471888780594, |
| "kl": 5.842745304107666e-05, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0422, |
| "reward": -0.17425783909857273, |
| "reward_std": 0.3371034972369671, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21592450886964798, |
| "step": 36 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3406.125, |
| "epoch": 0.04228571428571429, |
| "grad_norm": 0.0568542517721653, |
| "kl": 4.285573959350586e-05, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0194, |
| "reward": -0.044786570593714714, |
| "reward_std": 0.24069493543356657, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.12811991199851036, |
| "step": 37 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3252.375015258789, |
| "epoch": 0.04342857142857143, |
| "grad_norm": 0.053038232028484344, |
| "kl": 5.0187110900878906e-05, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": -0.0011, |
| "reward": 0.00859471783041954, |
| "reward_std": 0.3808100689202547, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.1164052952080965, |
| "step": 38 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2861.833335876465, |
| "epoch": 0.044571428571428574, |
| "grad_norm": 0.06544026732444763, |
| "kl": 3.574788570404053e-05, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0258, |
| "reward": 0.2788538106251508, |
| "reward_std": 0.21739591227378696, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": 0.008020471781492233, |
| "step": 39 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2407.020896911621, |
| "epoch": 0.045714285714285714, |
| "grad_norm": 0.07022753357887268, |
| "kl": 3.733113408088684e-05, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": -0.0064, |
| "reward": 0.11176938330754638, |
| "reward_std": 0.3469166085124016, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.05489729158580303, |
| "step": 40 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3272.0625915527344, |
| "epoch": 0.046857142857142854, |
| "grad_norm": 0.04923461005091667, |
| "kl": 4.369020462036133e-05, |
| "learning_rate": 7.75e-07, |
| "loss": -0.0115, |
| "reward": 0.19393274933099747, |
| "reward_std": 0.6539879608899355, |
| "rewards/accuracy_reward": 0.20833333767950535, |
| "rewards/cosine_scaled_reward": -0.014400593005120754, |
| "step": 41 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2870.9166774749756, |
| "epoch": 0.048, |
| "grad_norm": 0.10464286059141159, |
| "kl": 7.665157318115234e-05, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0068, |
| "reward": -0.23928624275140464, |
| "reward_std": 0.23300944967195392, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.26011957973241806, |
| "step": 42 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2943.6666717529297, |
| "epoch": 0.04914285714285714, |
| "grad_norm": 0.05960472300648689, |
| "kl": 4.680454730987549e-05, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": -0.0069, |
| "reward": -0.09865465015172958, |
| "reward_std": 0.264129894785583, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.18198798224329948, |
| "step": 43 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2371.2291946411133, |
| "epoch": 0.05028571428571429, |
| "grad_norm": 0.0943843275308609, |
| "kl": 4.684180021286011e-05, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.012, |
| "reward": 0.23478438053280115, |
| "reward_std": 0.5152947697788477, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": 0.02645104774273932, |
| "step": 44 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3218.7084045410156, |
| "epoch": 0.05142857142857143, |
| "grad_norm": 0.056803327053785324, |
| "kl": 4.836916923522949e-05, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0238, |
| "reward": 0.06656578462570906, |
| "reward_std": 0.46774430200457573, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.0792675418779254, |
| "step": 45 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3084.5, |
| "epoch": 0.052571428571428575, |
| "grad_norm": 0.06916145235300064, |
| "kl": 4.89652156829834e-05, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": -0.0137, |
| "reward": -0.15745936054736376, |
| "reward_std": 0.30057619512081146, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.19912602473050356, |
| "step": 46 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2617.854202270508, |
| "epoch": 0.053714285714285714, |
| "grad_norm": 0.0891217514872551, |
| "kl": 3.8154423236846924e-05, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0131, |
| "reward": 0.15285246446728706, |
| "reward_std": 0.609043394215405, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.05548086855560541, |
| "step": 47 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2537.8542098999023, |
| "epoch": 0.054857142857142854, |
| "grad_norm": 0.0810699462890625, |
| "kl": 3.608688712120056e-05, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0076, |
| "reward": 0.22076446935534477, |
| "reward_std": 0.6259710285812616, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": 0.012431120034307241, |
| "step": 48 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1916.0625305175781, |
| "epoch": 0.056, |
| "grad_norm": 0.11279745399951935, |
| "kl": 2.9848888516426086e-05, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": -0.0081, |
| "reward": 0.31437641754746437, |
| "reward_std": 0.4195905257947743, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/cosine_scaled_reward": 0.022709736600518227, |
| "step": 49 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3009.8333740234375, |
| "epoch": 0.05714285714285714, |
| "grad_norm": 0.06987818330526352, |
| "kl": 3.094226121902466e-05, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": -0.0053, |
| "reward": 0.3973438460379839, |
| "reward_std": 0.30630784668028355, |
| "rewards/accuracy_reward": 0.3125, |
| "rewards/cosine_scaled_reward": 0.084843834862113, |
| "step": 50 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2353.6041870117188, |
| "epoch": 0.05828571428571429, |
| "grad_norm": 0.0980035811662674, |
| "kl": 4.871189594268799e-05, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": -0.0059, |
| "reward": -0.023236393928527832, |
| "reward_std": 0.37473731534555554, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.14823639206588268, |
| "step": 51 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2840.583366394043, |
| "epoch": 0.05942857142857143, |
| "grad_norm": 0.0852833166718483, |
| "kl": 4.4442713260650635e-05, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.007, |
| "reward": 0.46025677397847176, |
| "reward_std": 0.6503263358026743, |
| "rewards/accuracy_reward": 0.3750000037252903, |
| "rewards/cosine_scaled_reward": 0.0852567870169878, |
| "step": 52 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2741.3958892822266, |
| "epoch": 0.060571428571428575, |
| "grad_norm": 0.08544100821018219, |
| "kl": 5.532801151275635e-05, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0061, |
| "reward": 0.1790762129239738, |
| "reward_std": 0.6837563626468182, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.05009045993210748, |
| "step": 53 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2533.4792098999023, |
| "epoch": 0.061714285714285715, |
| "grad_norm": 0.08326774090528488, |
| "kl": 3.949552774429321e-05, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0983, |
| "reward": 0.6619744710624218, |
| "reward_std": 0.7842050231993198, |
| "rewards/accuracy_reward": 0.45833335258066654, |
| "rewards/cosine_scaled_reward": 0.2036411385051906, |
| "step": 54 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2908.062530517578, |
| "epoch": 0.06285714285714286, |
| "grad_norm": 0.057915616780519485, |
| "kl": 3.5546720027923584e-05, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0188, |
| "reward": 0.27089552767574787, |
| "reward_std": 0.5088877673260868, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": 6.21667131781578e-05, |
| "step": 55 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3113.5625228881836, |
| "epoch": 0.064, |
| "grad_norm": 0.06461653858423233, |
| "kl": 3.863126039505005e-05, |
| "learning_rate": 5.5e-07, |
| "loss": -0.0314, |
| "reward": -0.09405255503952503, |
| "reward_std": 0.4130659643560648, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.17738589039072394, |
| "step": 56 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3084.500030517578, |
| "epoch": 0.06514285714285714, |
| "grad_norm": 0.052722394466400146, |
| "kl": 3.3408403396606445e-05, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": -0.0609, |
| "reward": 0.0041184090077877045, |
| "reward_std": 0.5053744353353977, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.14171493193134665, |
| "step": 57 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2236.958381652832, |
| "epoch": 0.06628571428571428, |
| "grad_norm": 0.09903010725975037, |
| "kl": 2.588331699371338e-05, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0531, |
| "reward": 0.36401716619729996, |
| "reward_std": 0.4780782051384449, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.030683835968375206, |
| "step": 58 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2744.979179382324, |
| "epoch": 0.06742857142857143, |
| "grad_norm": 0.07801861315965652, |
| "kl": 3.4183263778686523e-05, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0011, |
| "reward": -0.1735086990520358, |
| "reward_std": 0.38626679591834545, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.2568420314928517, |
| "step": 59 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2849.458366394043, |
| "epoch": 0.06857142857142857, |
| "grad_norm": 0.06434936821460724, |
| "kl": 3.138929605484009e-05, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0188, |
| "reward": -0.17126354575157166, |
| "reward_std": 0.2597244749777019, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.21293021738529205, |
| "step": 60 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2868.1042404174805, |
| "epoch": 0.06971428571428571, |
| "grad_norm": 0.058776307851076126, |
| "kl": 2.8677284717559814e-05, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": 0.0473, |
| "reward": 0.13987798150628805, |
| "reward_std": 0.4987390795722604, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.04762201849371195, |
| "step": 61 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2286.62504196167, |
| "epoch": 0.07085714285714285, |
| "grad_norm": 0.07639817148447037, |
| "kl": 1.909933052957058e-05, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": -0.0389, |
| "reward": 0.2486275490373373, |
| "reward_std": 0.3404446765780449, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": -0.0013724502641707659, |
| "step": 62 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1874.6875457763672, |
| "epoch": 0.072, |
| "grad_norm": 0.08694759756326675, |
| "kl": 3.208965063095093e-05, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": 0.0497, |
| "reward": 0.283432574942708, |
| "reward_std": 0.6469337809830904, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.012599228648468852, |
| "step": 63 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2791.3750228881836, |
| "epoch": 0.07314285714285715, |
| "grad_norm": 0.058527979999780655, |
| "kl": 4.375725984573364e-05, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": -0.0143, |
| "reward": 0.11854150518774986, |
| "reward_std": 0.557681780308485, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.06895849623833783, |
| "step": 64 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2645.5625228881836, |
| "epoch": 0.07428571428571429, |
| "grad_norm": 0.07280135899782181, |
| "kl": 2.8699636459350586e-05, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": -0.0051, |
| "reward": 0.27093657973455265, |
| "reward_std": 0.2569840718060732, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": 0.00010325387120246887, |
| "step": 65 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2068.145839691162, |
| "epoch": 0.07542857142857143, |
| "grad_norm": 0.13709966838359833, |
| "kl": 2.857297658920288e-05, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": -0.0312, |
| "reward": 0.37133984826505184, |
| "reward_std": 0.4443043200299144, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.01717321015894413, |
| "step": 66 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3338.5416870117188, |
| "epoch": 0.07657142857142857, |
| "grad_norm": 0.04955720156431198, |
| "kl": 2.9280781745910645e-05, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.0486, |
| "reward": -0.2874361127614975, |
| "reward_std": 0.17634079419076443, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.28743611462414265, |
| "step": 67 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1655.2916793823242, |
| "epoch": 0.07771428571428571, |
| "grad_norm": 0.10328181087970734, |
| "kl": 3.488361835479736e-05, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0563, |
| "reward": 0.04894767206860706, |
| "reward_std": 0.46508038230240345, |
| "rewards/accuracy_reward": 0.16666666977107525, |
| "rewards/cosine_scaled_reward": -0.11771899089217186, |
| "step": 68 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2115.520866394043, |
| "epoch": 0.07885714285714286, |
| "grad_norm": 0.08769083023071289, |
| "kl": 5.054473876953125e-05, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.0664, |
| "reward": -0.21063880110159516, |
| "reward_std": 0.22452263766899705, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.23147212620824575, |
| "step": 69 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2957.250045776367, |
| "epoch": 0.08, |
| "grad_norm": 0.07507078349590302, |
| "kl": 0.00010189414024353027, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0422, |
| "reward": 0.047607121989130974, |
| "reward_std": 0.4603970441967249, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.09822620265185833, |
| "step": 70 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2741.500011444092, |
| "epoch": 0.08114285714285714, |
| "grad_norm": 0.0938272476196289, |
| "kl": 3.9443373680114746e-05, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": -0.0011, |
| "reward": -0.011160964146256447, |
| "reward_std": 0.43730730563402176, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.09449429312371649, |
| "step": 71 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2641.395835876465, |
| "epoch": 0.08228571428571428, |
| "grad_norm": 0.08971308171749115, |
| "kl": 4.0397047996520996e-05, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": -0.03, |
| "reward": -0.12160800583660603, |
| "reward_std": 0.33957473933696747, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.16327467001974583, |
| "step": 72 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3444.104217529297, |
| "epoch": 0.08342857142857144, |
| "grad_norm": 0.047388430684804916, |
| "kl": 3.725290298461914e-05, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0397, |
| "reward": 0.0910368449985981, |
| "reward_std": 0.47095555253326893, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07562982058152556, |
| "step": 73 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2376.2916946411133, |
| "epoch": 0.08457142857142858, |
| "grad_norm": 0.09753943979740143, |
| "kl": 6.413459777832031e-05, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.1156, |
| "reward": 0.36098406091332436, |
| "reward_std": 0.521298123523593, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.06931740138679743, |
| "step": 74 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2761.0208587646484, |
| "epoch": 0.08571428571428572, |
| "grad_norm": 0.05892053619027138, |
| "kl": 3.839470446109772e-05, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": -0.0033, |
| "reward": 0.2974963430315256, |
| "reward_std": 0.32638865895569324, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.047496337443590164, |
| "step": 75 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2590.500045776367, |
| "epoch": 0.08685714285714285, |
| "grad_norm": 0.11792360991239548, |
| "kl": 3.809481859207153e-05, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": -0.0438, |
| "reward": -0.22247323859483004, |
| "reward_std": 0.22486078180372715, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2433065790683031, |
| "step": 76 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2898.000030517578, |
| "epoch": 0.088, |
| "grad_norm": 0.06427934020757675, |
| "kl": 4.331767559051514e-05, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0008, |
| "reward": -0.0680793123319745, |
| "reward_std": 0.2837136909365654, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.1305793197825551, |
| "step": 77 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3283.354217529297, |
| "epoch": 0.08914285714285715, |
| "grad_norm": 0.05109477415680885, |
| "kl": 3.820657730102539e-05, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.01, |
| "reward": 0.009964404162019491, |
| "reward_std": 0.5483912099152803, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.13586892932653427, |
| "step": 78 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2252.250045776367, |
| "epoch": 0.09028571428571429, |
| "grad_norm": 0.0837658941745758, |
| "kl": 2.925097942352295e-05, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": -0.004, |
| "reward": 0.2518207300454378, |
| "reward_std": 0.42728718742728233, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.0018207323737442493, |
| "step": 79 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3298.041702270508, |
| "epoch": 0.09142857142857143, |
| "grad_norm": 0.057548362761735916, |
| "kl": 4.651397466659546e-05, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": -0.005, |
| "reward": -0.07602027803659439, |
| "reward_std": 0.3821410443633795, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.18018695712089539, |
| "step": 80 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3132.81254196167, |
| "epoch": 0.09257142857142857, |
| "grad_norm": 0.07149508595466614, |
| "kl": 5.067884922027588e-05, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0055, |
| "reward": 0.011346326675266027, |
| "reward_std": 0.5589314438402653, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.13448702194727957, |
| "step": 81 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2614.4166870117188, |
| "epoch": 0.09371428571428571, |
| "grad_norm": 0.07767566293478012, |
| "kl": 4.256516695022583e-05, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0811, |
| "reward": 0.0747821144759655, |
| "reward_std": 0.6653982885181904, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.0918845571577549, |
| "step": 82 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2527.520835876465, |
| "epoch": 0.09485714285714286, |
| "grad_norm": 0.097532257437706, |
| "kl": 4.713237285614014e-05, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.025, |
| "reward": 0.1644680369645357, |
| "reward_std": 0.38445328176021576, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.023031978867948055, |
| "step": 83 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2790.479217529297, |
| "epoch": 0.096, |
| "grad_norm": 0.06318508833646774, |
| "kl": 3.428757190704346e-05, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": 0.0002, |
| "reward": 0.28921602852642536, |
| "reward_std": 0.5463491454720497, |
| "rewards/accuracy_reward": 0.291666679084301, |
| "rewards/cosine_scaled_reward": -0.002450621104799211, |
| "step": 84 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2910.7084045410156, |
| "epoch": 0.09714285714285714, |
| "grad_norm": 0.05188002064824104, |
| "kl": 3.795698285102844e-05, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0004, |
| "reward": -0.15216440707445145, |
| "reward_std": 0.3802374005317688, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.21466441452503204, |
| "step": 85 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2774.500045776367, |
| "epoch": 0.09828571428571428, |
| "grad_norm": 0.08824881166219711, |
| "kl": 4.828721284866333e-05, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": 0.0431, |
| "reward": -0.10176742170006037, |
| "reward_std": 0.28106818813830614, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.16426740679889917, |
| "step": 86 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2453.0000381469727, |
| "epoch": 0.09942857142857142, |
| "grad_norm": 0.07480672001838684, |
| "kl": 4.8995018005371094e-05, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.078, |
| "reward": 0.26724753249436617, |
| "reward_std": 0.574736475944519, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": 0.017247509211301804, |
| "step": 87 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2125.4583435058594, |
| "epoch": 0.10057142857142858, |
| "grad_norm": 0.10394497215747833, |
| "kl": 3.460049629211426e-05, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.1439, |
| "reward": 0.4671774273738265, |
| "reward_std": 0.8694365136325359, |
| "rewards/accuracy_reward": 0.33333334513008595, |
| "rewards/cosine_scaled_reward": 0.13384409341961145, |
| "step": 88 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2954.4583740234375, |
| "epoch": 0.10171428571428572, |
| "grad_norm": 0.05468936264514923, |
| "kl": 3.0003488063812256e-05, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0426, |
| "reward": 0.09598893485963345, |
| "reward_std": 0.49872371926903725, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.07067773153539747, |
| "step": 89 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2314.6666946411133, |
| "epoch": 0.10285714285714286, |
| "grad_norm": 0.11714471131563187, |
| "kl": 5.67510724067688e-05, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": -0.028, |
| "reward": -0.16322916094213724, |
| "reward_std": 0.30263339448720217, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.20489584188908339, |
| "step": 90 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2949.854202270508, |
| "epoch": 0.104, |
| "grad_norm": 0.06450404226779938, |
| "kl": 4.634261131286621e-05, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0308, |
| "reward": 0.009420277085155249, |
| "reward_std": 0.4902005009353161, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.13641306199133396, |
| "step": 91 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2546.2083740234375, |
| "epoch": 0.10514285714285715, |
| "grad_norm": 0.08860377222299576, |
| "kl": 3.190338611602783e-05, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0264, |
| "reward": -0.07203258201479912, |
| "reward_std": 0.37115267012268305, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.134532586671412, |
| "step": 92 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3584.0, |
| "epoch": 0.10628571428571429, |
| "grad_norm": 0.05677548050880432, |
| "kl": 5.491077899932861e-05, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": -0.0, |
| "reward": -0.2040251363068819, |
| "reward_std": 0.14581821858882904, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.20402513444423676, |
| "step": 93 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2474.3333740234375, |
| "epoch": 0.10742857142857143, |
| "grad_norm": 0.0738380178809166, |
| "kl": 5.2111921831965446e-05, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": -0.0028, |
| "reward": 0.18377637607045472, |
| "reward_std": 0.35242756828665733, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.024556951597332954, |
| "step": 94 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3511.8958435058594, |
| "epoch": 0.10857142857142857, |
| "grad_norm": 0.04296368733048439, |
| "kl": 3.476440906524658e-05, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0, |
| "reward": 0.0025116736069321632, |
| "reward_std": 0.38779854215681553, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.12248833384364843, |
| "step": 95 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2279.9583740234375, |
| "epoch": 0.10971428571428571, |
| "grad_norm": 0.08072325587272644, |
| "kl": 3.765523433685303e-05, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": -0.006, |
| "reward": -0.024050889536738396, |
| "reward_std": 0.4769116332754493, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.16988423094153404, |
| "step": 96 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3010.7708740234375, |
| "epoch": 0.11085714285714286, |
| "grad_norm": 0.06400413811206818, |
| "kl": 3.889948129653931e-05, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0745, |
| "reward": 0.06319739483296871, |
| "reward_std": 0.37303004786372185, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.10346927866339684, |
| "step": 97 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2826.0208435058594, |
| "epoch": 0.112, |
| "grad_norm": 0.05574605613946915, |
| "kl": 3.8623809814453125e-05, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.1032, |
| "reward": 0.11388520710170269, |
| "reward_std": 0.3436102643609047, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.07361481338739395, |
| "step": 98 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2823.6250076293945, |
| "epoch": 0.11314285714285714, |
| "grad_norm": 0.090733103454113, |
| "kl": 3.966689109802246e-05, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0091, |
| "reward": 0.06398104969412088, |
| "reward_std": 0.5714576002210379, |
| "rewards/accuracy_reward": 0.1666666679084301, |
| "rewards/cosine_scaled_reward": -0.10268562845885754, |
| "step": 99 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2570.083381652832, |
| "epoch": 0.11428571428571428, |
| "grad_norm": 0.07998774200677872, |
| "kl": 4.67151403427124e-05, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": -0.0209, |
| "reward": 0.07360539212822914, |
| "reward_std": 0.48094008676707745, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.07222793623805046, |
| "step": 100 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2722.187515258789, |
| "epoch": 0.11542857142857142, |
| "grad_norm": 0.07589790225028992, |
| "kl": 4.192441701889038e-05, |
| "learning_rate": 1e-07, |
| "loss": 0.069, |
| "reward": 0.08873439207673073, |
| "reward_std": 0.2866000607609749, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.09876561164855957, |
| "step": 101 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1838.4166946411133, |
| "epoch": 0.11657142857142858, |
| "grad_norm": 0.09402992576360703, |
| "kl": 3.657490015029907e-05, |
| "learning_rate": 6.203955092681039e-07, |
| "loss": 0.0156, |
| "reward": 0.06690541142597795, |
| "reward_std": 0.18542613834142685, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.05809460300952196, |
| "step": 102 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2870.6667137145996, |
| "epoch": 0.11771428571428572, |
| "grad_norm": 0.07137490063905716, |
| "kl": 4.7534704208374023e-05, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": -0.0495, |
| "reward": 0.24454372422769666, |
| "reward_std": 0.4555186741054058, |
| "rewards/accuracy_reward": 0.27083333395421505, |
| "rewards/cosine_scaled_reward": -0.026289615780115128, |
| "step": 103 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2432.3333778381348, |
| "epoch": 0.11885714285714286, |
| "grad_norm": 0.09176477044820786, |
| "kl": 3.963988274335861e-05, |
| "learning_rate": 6.048412045323164e-07, |
| "loss": 0.0414, |
| "reward": 0.011347562074661255, |
| "reward_std": 0.4153030626475811, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.11365244910120964, |
| "step": 104 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2421.7500381469727, |
| "epoch": 0.12, |
| "grad_norm": 0.08922665566205978, |
| "kl": 3.612414002418518e-05, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.1048, |
| "reward": 0.26649707183241844, |
| "reward_std": 0.5726894475519657, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": 0.016497071366757154, |
| "step": 105 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2394.250015258789, |
| "epoch": 0.12114285714285715, |
| "grad_norm": 0.08171434700489044, |
| "kl": 1.9472092390060425e-05, |
| "learning_rate": 5.892200842364462e-07, |
| "loss": -0.0643, |
| "reward": 0.6783079504966736, |
| "reward_std": 0.5645040161907673, |
| "rewards/accuracy_reward": 0.479166679084301, |
| "rewards/cosine_scaled_reward": 0.19914129562675953, |
| "step": 106 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2945.1041717529297, |
| "epoch": 0.12228571428571429, |
| "grad_norm": 0.07043629884719849, |
| "kl": 3.892183303833008e-05, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": -0.0082, |
| "reward": 0.09549596160650253, |
| "reward_std": 0.38008159026503563, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07117070630192757, |
| "step": 107 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2612.541732788086, |
| "epoch": 0.12342857142857143, |
| "grad_norm": 0.08020760118961334, |
| "kl": 6.224215030670166e-05, |
| "learning_rate": 5.735511803093248e-07, |
| "loss": 0.0319, |
| "reward": -0.06708686612546444, |
| "reward_std": 0.372654527425766, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.17125353403389454, |
| "step": 108 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3040.041679382324, |
| "epoch": 0.12457142857142857, |
| "grad_norm": 0.05782421678304672, |
| "kl": 3.9560720324516296e-05, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": -0.0298, |
| "reward": 0.06678529269993305, |
| "reward_std": 0.23496104590594769, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.079048041254282, |
| "step": 109 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2703.375030517578, |
| "epoch": 0.12571428571428572, |
| "grad_norm": 0.07456289976835251, |
| "kl": 4.085153341293335e-05, |
| "learning_rate": 5.578535828967777e-07, |
| "loss": 0.0608, |
| "reward": 0.09353478881530464, |
| "reward_std": 0.6213366910815239, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": -0.11479855328798294, |
| "step": 110 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3046.000045776367, |
| "epoch": 0.12685714285714286, |
| "grad_norm": 0.06936592608690262, |
| "kl": 4.996359348297119e-05, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0529, |
| "reward": 0.04783104546368122, |
| "reward_std": 0.406554002314806, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.09800229035317898, |
| "step": 111 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3252.4791870117188, |
| "epoch": 0.128, |
| "grad_norm": 0.056168291717767715, |
| "kl": 4.392117261886597e-05, |
| "learning_rate": 5.421464171032224e-07, |
| "loss": -0.0406, |
| "reward": 0.160230646841228, |
| "reward_std": 0.3509800494648516, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.027269369922578335, |
| "step": 112 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2445.0833435058594, |
| "epoch": 0.12914285714285714, |
| "grad_norm": 0.09087050706148148, |
| "kl": 5.398690700531006e-05, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0283, |
| "reward": -0.02850266359746456, |
| "reward_std": 0.3581254305317998, |
| "rewards/accuracy_reward": 0.08333333395421505, |
| "rewards/cosine_scaled_reward": -0.1118359980173409, |
| "step": 113 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2361.2708854675293, |
| "epoch": 0.13028571428571428, |
| "grad_norm": 0.0836726725101471, |
| "kl": 2.1731480956077576e-05, |
| "learning_rate": 5.264488196906752e-07, |
| "loss": 0.0029, |
| "reward": -0.08279290050268173, |
| "reward_std": 0.2787897954694927, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.16612621676176786, |
| "step": 114 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2874.3333435058594, |
| "epoch": 0.13142857142857142, |
| "grad_norm": 0.0852808952331543, |
| "kl": 3.896281123161316e-05, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": -0.0246, |
| "reward": 0.2715425807982683, |
| "reward_std": 0.3593301521614194, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": -0.02012409595772624, |
| "step": 115 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3274.2708435058594, |
| "epoch": 0.13257142857142856, |
| "grad_norm": 0.05964406952261925, |
| "kl": 4.2632222175598145e-05, |
| "learning_rate": 5.107799157635538e-07, |
| "loss": -0.0429, |
| "reward": -0.0861705094575882, |
| "reward_std": 0.2727812984958291, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.16950384341180325, |
| "step": 116 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3269.625, |
| "epoch": 0.1337142857142857, |
| "grad_norm": 0.05711478367447853, |
| "kl": 5.498528480529785e-05, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0143, |
| "reward": -0.2802669182419777, |
| "reward_std": 0.18606608174741268, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2802669182419777, |
| "step": 117 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2989.854202270508, |
| "epoch": 0.13485714285714287, |
| "grad_norm": 0.05973963439464569, |
| "kl": 3.249943256378174e-05, |
| "learning_rate": 4.951587954676837e-07, |
| "loss": 0.0042, |
| "reward": 0.4425080083310604, |
| "reward_std": 0.6226732302457094, |
| "rewards/accuracy_reward": 0.354166679084301, |
| "rewards/cosine_scaled_reward": 0.0883413702249527, |
| "step": 118 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1957.4583740234375, |
| "epoch": 0.136, |
| "grad_norm": 0.09650486707687378, |
| "kl": 5.301833152770996e-05, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": 0.0763, |
| "reward": 0.2231588363647461, |
| "reward_std": 0.4853620417416096, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.006007825839333236, |
| "step": 119 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2861.604179382324, |
| "epoch": 0.13714285714285715, |
| "grad_norm": 0.08811937272548676, |
| "kl": 4.668533802032471e-05, |
| "learning_rate": 4.79604490731896e-07, |
| "loss": 0.069, |
| "reward": 0.18268701434135437, |
| "reward_std": 0.5529882707633078, |
| "rewards/accuracy_reward": 0.20833334140479565, |
| "rewards/cosine_scaled_reward": -0.025646327529102564, |
| "step": 120 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2120.895881652832, |
| "epoch": 0.1382857142857143, |
| "grad_norm": 0.08887775242328644, |
| "kl": 3.911927342414856e-05, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": -0.0243, |
| "reward": 0.0723479799926281, |
| "reward_std": 0.4600833263248205, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.07348535116761923, |
| "step": 121 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3076.645866394043, |
| "epoch": 0.13942857142857143, |
| "grad_norm": 0.07256121933460236, |
| "kl": 4.369020462036133e-05, |
| "learning_rate": 4.641359520805548e-07, |
| "loss": 0.0058, |
| "reward": 0.306396946310997, |
| "reward_std": 0.5766899082809687, |
| "rewards/accuracy_reward": 0.2500000074505806, |
| "rewards/cosine_scaled_reward": 0.05639694258570671, |
| "step": 122 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2826.8750228881836, |
| "epoch": 0.14057142857142857, |
| "grad_norm": 0.06723114103078842, |
| "kl": 4.945695400238037e-05, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": -0.0474, |
| "reward": -0.022131433710455894, |
| "reward_std": 0.3866724129766226, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.16796477201114612, |
| "step": 123 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2317.583354949951, |
| "epoch": 0.1417142857142857, |
| "grad_norm": 0.08971195667982101, |
| "kl": 3.826618194580078e-05, |
| "learning_rate": 4.4877202554526084e-07, |
| "loss": 0.0437, |
| "reward": 0.25669101858511567, |
| "reward_std": 0.4173083985224366, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.034975672140717506, |
| "step": 124 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2852.875030517578, |
| "epoch": 0.14285714285714285, |
| "grad_norm": 0.060154370963573456, |
| "kl": 3.0517578125e-05, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.0046, |
| "reward": 0.3323164558969438, |
| "reward_std": 0.5723556145094335, |
| "rewards/accuracy_reward": 0.31250000558793545, |
| "rewards/cosine_scaled_reward": 0.019816441694274545, |
| "step": 125 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2835.3750610351562, |
| "epoch": 0.144, |
| "grad_norm": 0.06273169815540314, |
| "kl": 2.740137279033661e-05, |
| "learning_rate": 4.3353142970386557e-07, |
| "loss": 0.0187, |
| "reward": 0.07377211796119809, |
| "reward_std": 0.414914159104228, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.07206120900809765, |
| "step": 126 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3493.5, |
| "epoch": 0.14514285714285713, |
| "grad_norm": 0.051632124930620193, |
| "kl": 3.935769200325012e-05, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.0178, |
| "reward": -0.10274199862033129, |
| "reward_std": 0.32821359671652317, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.14440866466611624, |
| "step": 127 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2737.229217529297, |
| "epoch": 0.1462857142857143, |
| "grad_norm": 0.06725908070802689, |
| "kl": 2.730637788772583e-05, |
| "learning_rate": 4.1843273287476854e-07, |
| "loss": 0.0261, |
| "reward": 0.4665216477587819, |
| "reward_std": 0.5292752608656883, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.13318830379284918, |
| "step": 128 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3285.9791870117188, |
| "epoch": 0.14742857142857144, |
| "grad_norm": 0.059827111661434174, |
| "kl": 4.4018030166625977e-05, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0326, |
| "reward": -0.1473593506962061, |
| "reward_std": 0.34279950708150864, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.20985935255885124, |
| "step": 129 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3269.8958435058594, |
| "epoch": 0.14857142857142858, |
| "grad_norm": 0.0539652518928051, |
| "kl": 4.6581029891967773e-05, |
| "learning_rate": 4.034943304942796e-07, |
| "loss": -0.0338, |
| "reward": 0.012523974291980267, |
| "reward_std": 0.3652627067640424, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.09164270292967558, |
| "step": 130 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2738.0416946411133, |
| "epoch": 0.14971428571428572, |
| "grad_norm": 0.07965836673974991, |
| "kl": 3.647804260253906e-05, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.0134, |
| "reward": 0.40422316640615463, |
| "reward_std": 0.4564414999913424, |
| "rewards/accuracy_reward": 0.3750000074505806, |
| "rewards/cosine_scaled_reward": 0.029223157092928886, |
| "step": 131 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2821.604179382324, |
| "epoch": 0.15085714285714286, |
| "grad_norm": 0.06730424612760544, |
| "kl": 3.738701343536377e-05, |
| "learning_rate": 3.8873442270461485e-07, |
| "loss": -0.0595, |
| "reward": 0.14993381313979626, |
| "reward_std": 0.24461174756288528, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.07923284359276295, |
| "step": 132 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3298.9166870117188, |
| "epoch": 0.152, |
| "grad_norm": 0.0547853484749794, |
| "kl": 3.6306679248809814e-05, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": -0.0196, |
| "reward": -0.0262713935226202, |
| "reward_std": 0.28523722756654024, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.21377139631658792, |
| "step": 133 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2631.020851135254, |
| "epoch": 0.15314285714285714, |
| "grad_norm": 0.09167379140853882, |
| "kl": 5.6549906730651855e-05, |
| "learning_rate": 3.7417099217982686e-07, |
| "loss": -0.0388, |
| "reward": 0.13690885063260794, |
| "reward_std": 0.33062932174652815, |
| "rewards/accuracy_reward": 0.18750000186264515, |
| "rewards/cosine_scaled_reward": -0.05059114011237398, |
| "step": 134 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1884.020866394043, |
| "epoch": 0.15428571428571428, |
| "grad_norm": 0.09686336666345596, |
| "kl": 4.8333313316106796e-05, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": 0.0075, |
| "reward": 0.5446690749377012, |
| "reward_std": 0.5788980075158179, |
| "rewards/accuracy_reward": 0.41666666977107525, |
| "rewards/cosine_scaled_reward": 0.12800239364150912, |
| "step": 135 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2724.2083740234375, |
| "epoch": 0.15542857142857142, |
| "grad_norm": 0.06881242245435715, |
| "kl": 3.3229589462280273e-05, |
| "learning_rate": 3.5982178221668533e-07, |
| "loss": -0.0172, |
| "reward": 0.25056808441877365, |
| "reward_std": 0.6991038620471954, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.0005680816248059273, |
| "step": 136 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3288.2916717529297, |
| "epoch": 0.15657142857142858, |
| "grad_norm": 0.04798026755452156, |
| "kl": 3.696233034133911e-05, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": -0.0012, |
| "reward": -0.2087622880935669, |
| "reward_std": 0.2625069562345743, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.25042895757360384, |
| "step": 137 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2845.5416946411133, |
| "epoch": 0.15771428571428572, |
| "grad_norm": 0.06401628255844116, |
| "kl": 2.7161091566085815e-05, |
| "learning_rate": 3.45704275117204e-07, |
| "loss": -0.007, |
| "reward": 0.04433951433748007, |
| "reward_std": 0.16650092136114836, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.08066049311310053, |
| "step": 138 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2961.604202270508, |
| "epoch": 0.15885714285714286, |
| "grad_norm": 0.06384900212287903, |
| "kl": 4.455633461475372e-05, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0591, |
| "reward": -0.00648902915418148, |
| "reward_std": 0.3278183531947434, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.11065569380298257, |
| "step": 139 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3147.437530517578, |
| "epoch": 0.16, |
| "grad_norm": 0.0742841511964798, |
| "kl": 4.8547983169555664e-05, |
| "learning_rate": 3.3183567088914833e-07, |
| "loss": 0.0277, |
| "reward": 0.2934675266733393, |
| "reward_std": 0.46008346043527126, |
| "rewards/accuracy_reward": 0.2291666679084301, |
| "rewards/cosine_scaled_reward": 0.06430085934698582, |
| "step": 140 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3212.6458740234375, |
| "epoch": 0.16114285714285714, |
| "grad_norm": 0.05506473779678345, |
| "kl": 3.645569086074829e-05, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": -0.0265, |
| "reward": 0.12175815179944038, |
| "reward_std": 0.5772441830486059, |
| "rewards/accuracy_reward": 0.18750000558793545, |
| "rewards/cosine_scaled_reward": -0.06574184074997902, |
| "step": 141 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2816.729232788086, |
| "epoch": 0.16228571428571428, |
| "grad_norm": 0.06666780263185501, |
| "kl": 4.5670196413993835e-05, |
| "learning_rate": 3.182328662904756e-07, |
| "loss": 0.0521, |
| "reward": 0.16211201017722487, |
| "reward_std": 0.5409660097211599, |
| "rewards/accuracy_reward": 0.22916667349636555, |
| "rewards/cosine_scaled_reward": -0.06705465726554394, |
| "step": 142 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2639.291702270508, |
| "epoch": 0.16342857142857142, |
| "grad_norm": 0.08498696982860565, |
| "kl": 4.427810199558735e-05, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": 0.0497, |
| "reward": -0.27455265261232853, |
| "reward_std": 0.23319050949066877, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2953859841218218, |
| "step": 143 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3100.437511444092, |
| "epoch": 0.16457142857142856, |
| "grad_norm": 0.07933609187602997, |
| "kl": 3.787130117416382e-05, |
| "learning_rate": 3.0491243424323783e-07, |
| "loss": -0.0187, |
| "reward": 0.18983200006186962, |
| "reward_std": 0.3597974181175232, |
| "rewards/accuracy_reward": 0.22916666977107525, |
| "rewards/cosine_scaled_reward": -0.03933467622846365, |
| "step": 144 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2241.604179382324, |
| "epoch": 0.1657142857142857, |
| "grad_norm": 0.09724698215723038, |
| "kl": 3.6539509892463684e-05, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": -0.0007, |
| "reward": 0.24542317104896938, |
| "reward_std": 0.4150366364046931, |
| "rewards/accuracy_reward": 0.29166667349636555, |
| "rewards/cosine_scaled_reward": -0.04624348785728216, |
| "step": 145 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2525.7292098999023, |
| "epoch": 0.16685714285714287, |
| "grad_norm": 0.06342972815036774, |
| "kl": 2.726912498474121e-05, |
| "learning_rate": 2.918906036420294e-07, |
| "loss": -0.0081, |
| "reward": -0.045263445004820824, |
| "reward_std": 0.40356126986443996, |
| "rewards/accuracy_reward": 0.1041666679084301, |
| "rewards/cosine_scaled_reward": -0.14943011198192835, |
| "step": 146 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3529.729217529297, |
| "epoch": 0.168, |
| "grad_norm": 0.0595870241522789, |
| "kl": 4.3898820877075195e-05, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0118, |
| "reward": 0.07939862087368965, |
| "reward_std": 0.5240298006683588, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.10810139158274978, |
| "step": 147 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2771.041702270508, |
| "epoch": 0.16914285714285715, |
| "grad_norm": 0.0586538165807724, |
| "kl": 2.527981996536255e-05, |
| "learning_rate": 2.791832395815782e-07, |
| "loss": -0.0089, |
| "reward": 0.07017020601779222, |
| "reward_std": 0.2498982846736908, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.0756631288677454, |
| "step": 148 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2869.4375534057617, |
| "epoch": 0.1702857142857143, |
| "grad_norm": 0.07042869925498962, |
| "kl": 3.543868660926819e-05, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0055, |
| "reward": 0.2183131380006671, |
| "reward_std": 0.27962948102504015, |
| "rewards/accuracy_reward": 0.2291666716337204, |
| "rewards/cosine_scaled_reward": -0.010853511281311512, |
| "step": 149 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2761.625030517578, |
| "epoch": 0.17142857142857143, |
| "grad_norm": 0.07804974168539047, |
| "kl": 4.456937313079834e-05, |
| "learning_rate": 2.6680582402757324e-07, |
| "loss": 0.0089, |
| "reward": -0.214060353115201, |
| "reward_std": 0.2980783907696605, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.25572701659984887, |
| "step": 150 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2819.5833740234375, |
| "epoch": 0.17257142857142857, |
| "grad_norm": 0.07131638377904892, |
| "kl": 4.2825937271118164e-05, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": -0.0174, |
| "reward": 0.032599929720163345, |
| "reward_std": 0.4547419548034668, |
| "rewards/accuracy_reward": 0.14583333767950535, |
| "rewards/cosine_scaled_reward": -0.1132334005087614, |
| "step": 151 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3040.833354949951, |
| "epoch": 0.1737142857142857, |
| "grad_norm": 0.10568142682313919, |
| "kl": 5.511939525604248e-05, |
| "learning_rate": 2.547734369542718e-07, |
| "loss": 0.0503, |
| "reward": -0.10969539848156273, |
| "reward_std": 0.40426155179739, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.1721954019740224, |
| "step": 152 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2793.0417251586914, |
| "epoch": 0.17485714285714285, |
| "grad_norm": 0.09443458169698715, |
| "kl": 4.933774471282959e-05, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0244, |
| "reward": -0.0962764136493206, |
| "reward_std": 0.34763350896537304, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.17960975063033402, |
| "step": 153 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3132.8125610351562, |
| "epoch": 0.176, |
| "grad_norm": 0.05638415366411209, |
| "kl": 2.7516856789588928e-05, |
| "learning_rate": 2.4310073797187573e-07, |
| "loss": -0.0348, |
| "reward": 0.449524587020278, |
| "reward_std": 0.5751714678481221, |
| "rewards/accuracy_reward": 0.3541666753590107, |
| "rewards/cosine_scaled_reward": 0.09535788558423519, |
| "step": 154 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2529.104202270508, |
| "epoch": 0.17714285714285713, |
| "grad_norm": 0.08164854347705841, |
| "kl": 3.826734609901905e-05, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0305, |
| "reward": 0.2544417988974601, |
| "reward_std": 0.46309633809141815, |
| "rewards/accuracy_reward": 0.25000000931322575, |
| "rewards/cosine_scaled_reward": 0.004441759781911969, |
| "step": 155 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2993.270866394043, |
| "epoch": 0.1782857142857143, |
| "grad_norm": 0.06250454485416412, |
| "kl": 3.37064266204834e-05, |
| "learning_rate": 2.3180194846605364e-07, |
| "loss": -0.0066, |
| "reward": -0.026538243517279625, |
| "reward_std": 0.4186902232468128, |
| "rewards/accuracy_reward": 0.125, |
| "rewards/cosine_scaled_reward": -0.15153825469315052, |
| "step": 156 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3177.8333435058594, |
| "epoch": 0.17942857142857144, |
| "grad_norm": 0.053174857050180435, |
| "kl": 3.8951635360717773e-05, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0154, |
| "reward": -0.12036301381886005, |
| "reward_std": 0.44441020861268044, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.20369636011309922, |
| "step": 157 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2696.041702270508, |
| "epoch": 0.18057142857142858, |
| "grad_norm": 0.06273730844259262, |
| "kl": 4.3585896492004395e-05, |
| "learning_rate": 2.2089083427137329e-07, |
| "loss": 0.0106, |
| "reward": 0.20966186001896858, |
| "reward_std": 0.24054066091775894, |
| "rewards/accuracy_reward": 0.2083333358168602, |
| "rewards/cosine_scaled_reward": 0.0013285204768180847, |
| "step": 158 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3259.7708435058594, |
| "epoch": 0.18171428571428572, |
| "grad_norm": 0.05220724642276764, |
| "kl": 2.981722354888916e-05, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.0177, |
| "reward": -0.20584244281053543, |
| "reward_std": 0.28025806602090597, |
| "rewards/accuracy_reward": 0.0625, |
| "rewards/cosine_scaled_reward": -0.2683424372226, |
| "step": 159 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3075.4792098999023, |
| "epoch": 0.18285714285714286, |
| "grad_norm": 0.06919407844543457, |
| "kl": 5.105137825012207e-05, |
| "learning_rate": 2.1038068889975259e-07, |
| "loss": 0.0367, |
| "reward": -0.008766223094426095, |
| "reward_std": 0.4496794454753399, |
| "rewards/accuracy_reward": 0.1250000037252903, |
| "rewards/cosine_scaled_reward": -0.13376623298972845, |
| "step": 160 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2116.8958892822266, |
| "epoch": 0.184, |
| "grad_norm": 0.08072680234909058, |
| "kl": 4.120171070098877e-05, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": -0.056, |
| "reward": 0.15352233685553074, |
| "reward_std": 0.5114505719393492, |
| "rewards/accuracy_reward": 0.1875, |
| "rewards/cosine_scaled_reward": -0.03397766686975956, |
| "step": 161 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3499.2916870117188, |
| "epoch": 0.18514285714285714, |
| "grad_norm": 0.0527619905769825, |
| "kl": 3.422051668167114e-05, |
| "learning_rate": 2.0028431734436308e-07, |
| "loss": -0.0021, |
| "reward": -0.038823087234050035, |
| "reward_std": 0.4940961766988039, |
| "rewards/accuracy_reward": 0.12500000186264515, |
| "rewards/cosine_scaled_reward": -0.1638230886310339, |
| "step": 162 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2734.8333892822266, |
| "epoch": 0.18628571428571428, |
| "grad_norm": 0.07632818818092346, |
| "kl": 3.5293400287628174e-05, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.0764, |
| "reward": 0.4233822599053383, |
| "reward_std": 0.4737155893817544, |
| "rewards/accuracy_reward": 0.3541666716337204, |
| "rewards/cosine_scaled_reward": 0.06921557523310184, |
| "step": 163 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2648.4375381469727, |
| "epoch": 0.18742857142857142, |
| "grad_norm": 0.08120272308588028, |
| "kl": 4.1447579860687256e-05, |
| "learning_rate": 1.9061402047871833e-07, |
| "loss": -0.0271, |
| "reward": 0.2842383498791605, |
| "reward_std": 0.5444171037524939, |
| "rewards/accuracy_reward": 0.25000000558793545, |
| "rewards/cosine_scaled_reward": 0.034238346852362156, |
| "step": 164 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3249.2083435058594, |
| "epoch": 0.18857142857142858, |
| "grad_norm": 0.06324990838766098, |
| "kl": 4.744529724121094e-05, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": -0.0558, |
| "reward": -0.2445811154320836, |
| "reward_std": 0.2507223319262266, |
| "rewards/accuracy_reward": 0.02083333395421505, |
| "rewards/cosine_scaled_reward": -0.2654144521802664, |
| "step": 165 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2949.8541717529297, |
| "epoch": 0.18971428571428572, |
| "grad_norm": 0.06484034657478333, |
| "kl": 4.217028617858887e-05, |
| "learning_rate": 1.8138158006995363e-07, |
| "loss": 0.0424, |
| "reward": -0.1878061555325985, |
| "reward_std": 0.31927773356437683, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.22947282809764147, |
| "step": 166 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2445.354202270508, |
| "epoch": 0.19085714285714286, |
| "grad_norm": 0.08651679754257202, |
| "kl": 3.0443072319030762e-05, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": -0.0164, |
| "reward": 0.3632579315453768, |
| "reward_std": 0.48152059130370617, |
| "rewards/accuracy_reward": 0.29166666977107525, |
| "rewards/cosine_scaled_reward": 0.07159125246107578, |
| "step": 167 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3308.1458740234375, |
| "epoch": 0.192, |
| "grad_norm": 0.05902985855937004, |
| "kl": 4.501640796661377e-05, |
| "learning_rate": 1.7259824442455923e-07, |
| "loss": 0.014, |
| "reward": 0.1645604595541954, |
| "reward_std": 0.5049289520829916, |
| "rewards/accuracy_reward": 0.1875000074505806, |
| "rewards/cosine_scaled_reward": -0.02293952787294984, |
| "step": 168 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2684.062545776367, |
| "epoch": 0.19314285714285714, |
| "grad_norm": 0.055804040282964706, |
| "kl": 3.5997480154037476e-05, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0215, |
| "reward": 0.6388495257124305, |
| "reward_std": 0.4502353947609663, |
| "rewards/accuracy_reward": 0.47916667722165585, |
| "rewards/cosine_scaled_reward": 0.15968285594135523, |
| "step": 169 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2689.9167404174805, |
| "epoch": 0.19428571428571428, |
| "grad_norm": 0.07801216095685959, |
| "kl": 2.8800219297409058e-05, |
| "learning_rate": 1.6427471468404952e-07, |
| "loss": -0.0261, |
| "reward": 0.21209237910807133, |
| "reward_std": 0.18574862275272608, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": -0.037907619029283524, |
| "step": 170 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2724.9583435058594, |
| "epoch": 0.19542857142857142, |
| "grad_norm": 0.08230967074632645, |
| "kl": 4.396587610244751e-05, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": 0.1104, |
| "reward": 0.2708106115460396, |
| "reward_std": 0.4123786874115467, |
| "rewards/accuracy_reward": 0.2500000111758709, |
| "rewards/cosine_scaled_reward": 0.020810591988265514, |
| "step": 171 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2835.2291870117188, |
| "epoch": 0.19657142857142856, |
| "grad_norm": 0.0861576646566391, |
| "kl": 4.382431507110596e-05, |
| "learning_rate": 1.5642113178727193e-07, |
| "loss": -0.0178, |
| "reward": 0.3303783554583788, |
| "reward_std": 0.26864094100892544, |
| "rewards/accuracy_reward": 0.2916666679084301, |
| "rewards/cosine_scaled_reward": 0.03871168568730354, |
| "step": 172 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2145.4791717529297, |
| "epoch": 0.1977142857142857, |
| "grad_norm": 0.11213409900665283, |
| "kl": 2.8414186090230942e-05, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0598, |
| "reward": -0.1945440210402012, |
| "reward_std": 0.2684439942240715, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.236210685223341, |
| "step": 173 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2457.000045776367, |
| "epoch": 0.19885714285714284, |
| "grad_norm": 0.07888836413621902, |
| "kl": 4.423223435878754e-05, |
| "learning_rate": 1.4904706411523448e-07, |
| "loss": 0.0072, |
| "reward": 0.028682731091976166, |
| "reward_std": 0.3702436462044716, |
| "rewards/accuracy_reward": 0.1458333395421505, |
| "rewards/cosine_scaled_reward": -0.117150594945997, |
| "step": 174 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2821.541702270508, |
| "epoch": 0.2, |
| "grad_norm": 0.06768631935119629, |
| "kl": 3.556848969310522e-05, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": -0.0158, |
| "reward": 0.3141335854306817, |
| "reward_std": 0.3858940042555332, |
| "rewards/accuracy_reward": 0.25, |
| "rewards/cosine_scaled_reward": 0.06413357798010111, |
| "step": 175 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2702.479217529297, |
| "epoch": 0.20114285714285715, |
| "grad_norm": 0.09728685021400452, |
| "kl": 3.568828105926514e-05, |
| "learning_rate": 1.4216149583350755e-07, |
| "loss": 0.0125, |
| "reward": 0.4325672350823879, |
| "reward_std": 0.7816441245377064, |
| "rewards/accuracy_reward": 0.33333334140479565, |
| "rewards/cosine_scaled_reward": 0.09923387924209237, |
| "step": 176 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2954.2500534057617, |
| "epoch": 0.2022857142857143, |
| "grad_norm": 0.07380012422800064, |
| "kl": 5.640089511871338e-05, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": -0.038, |
| "reward": 0.03029090305790305, |
| "reward_std": 0.3226492116227746, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.0947091169655323, |
| "step": 177 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2251.062530517578, |
| "epoch": 0.20342857142857143, |
| "grad_norm": 0.09012287110090256, |
| "kl": 3.452599048614502e-05, |
| "learning_rate": 1.3577281594640182e-07, |
| "loss": 0.0711, |
| "reward": 0.21392884047236294, |
| "reward_std": 0.4973601717501879, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": 0.0055954959243535995, |
| "step": 178 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2972.9375, |
| "epoch": 0.20457142857142857, |
| "grad_norm": 0.0639650970697403, |
| "kl": 3.37846577167511e-05, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0246, |
| "reward": -0.15617204643785954, |
| "reward_std": 0.33500426076352596, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.2186720399186015, |
| "step": 179 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2245.6666870117188, |
| "epoch": 0.2057142857142857, |
| "grad_norm": 0.11034268140792847, |
| "kl": 4.6290457248687744e-05, |
| "learning_rate": 1.2988880807625927e-07, |
| "loss": -0.0654, |
| "reward": 0.39349728263914585, |
| "reward_std": 0.5014538783580065, |
| "rewards/accuracy_reward": 0.3333333395421505, |
| "rewards/cosine_scaled_reward": 0.06016395008191466, |
| "step": 180 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3190.9791679382324, |
| "epoch": 0.20685714285714285, |
| "grad_norm": 0.07291311770677567, |
| "kl": 3.898981958627701e-05, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0136, |
| "reward": -0.06048195622861385, |
| "reward_std": 0.25065876357257366, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.16464862413704395, |
| "step": 181 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2276.5833587646484, |
| "epoch": 0.208, |
| "grad_norm": 0.07662935554981232, |
| "kl": 2.4404376745224e-05, |
| "learning_rate": 1.2451664098030743e-07, |
| "loss": -0.0495, |
| "reward": 0.17493994487449527, |
| "reward_std": 0.6582750072702765, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.03339339280501008, |
| "step": 182 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 1754.2708587646484, |
| "epoch": 0.20914285714285713, |
| "grad_norm": 0.11352153867483139, |
| "kl": 4.573538899421692e-05, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": 0.0516, |
| "reward": 0.18977578077465296, |
| "reward_std": 0.6147185508161783, |
| "rewards/accuracy_reward": 0.2083333395421505, |
| "rewards/cosine_scaled_reward": -0.018557552015408874, |
| "step": 183 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3000.437530517578, |
| "epoch": 0.2102857142857143, |
| "grad_norm": 0.07938341796398163, |
| "kl": 4.5321881771087646e-05, |
| "learning_rate": 1.1966285981663407e-07, |
| "loss": -0.0348, |
| "reward": -0.057322083972394466, |
| "reward_std": 0.22675575967878103, |
| "rewards/accuracy_reward": 0.1041666716337204, |
| "rewards/cosine_scaled_reward": -0.16148875933140516, |
| "step": 184 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2710.31254196167, |
| "epoch": 0.21142857142857144, |
| "grad_norm": 0.11325710266828537, |
| "kl": 3.845244646072388e-05, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0851, |
| "reward": 0.04408662021160126, |
| "reward_std": 0.36012868769466877, |
| "rewards/accuracy_reward": 0.1458333358168602, |
| "rewards/cosine_scaled_reward": -0.10174670163542032, |
| "step": 185 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3046.1458435058594, |
| "epoch": 0.21257142857142858, |
| "grad_norm": 0.05957398936152458, |
| "kl": 3.740936517715454e-05, |
| "learning_rate": 1.1533337816991931e-07, |
| "loss": 0.0061, |
| "reward": 0.14046389423310757, |
| "reward_std": 0.48607588466256857, |
| "rewards/accuracy_reward": 0.1875000037252903, |
| "rewards/cosine_scaled_reward": -0.04703611321747303, |
| "step": 186 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2773.4167098999023, |
| "epoch": 0.21371428571428572, |
| "grad_norm": 0.08205266296863556, |
| "kl": 4.443526268005371e-05, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0641, |
| "reward": -0.12049626559019089, |
| "reward_std": 0.27329744305461645, |
| "rewards/accuracy_reward": 0.0833333358168602, |
| "rewards/cosine_scaled_reward": -0.2038295976817608, |
| "step": 187 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3556.8125, |
| "epoch": 0.21485714285714286, |
| "grad_norm": 0.04973941296339035, |
| "kl": 4.267692565917969e-05, |
| "learning_rate": 1.1153347084664419e-07, |
| "loss": -0.0033, |
| "reward": -0.09900977090001106, |
| "reward_std": 0.36740162037312984, |
| "rewards/accuracy_reward": 0.06250000186264515, |
| "rewards/cosine_scaled_reward": -0.1615097806788981, |
| "step": 188 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2301.937515258789, |
| "epoch": 0.216, |
| "grad_norm": 0.08138614147901535, |
| "kl": 4.2557716369628906e-05, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": -0.0459, |
| "reward": -0.12572868075221777, |
| "reward_std": 0.2826405204832554, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.16739534121006727, |
| "step": 189 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3100.750030517578, |
| "epoch": 0.21714285714285714, |
| "grad_norm": 0.054819922894239426, |
| "kl": 2.2755935788154602e-05, |
| "learning_rate": 1.0826776744855121e-07, |
| "loss": -0.0095, |
| "reward": 0.31759869679808617, |
| "reward_std": 0.4463086621835828, |
| "rewards/accuracy_reward": 0.27083333767950535, |
| "rewards/cosine_scaled_reward": 0.04676533304154873, |
| "step": 190 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2485.2916870117188, |
| "epoch": 0.21828571428571428, |
| "grad_norm": 0.07485377788543701, |
| "kl": 4.908442497253418e-05, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": 0.0541, |
| "reward": 0.18139376863837242, |
| "reward_std": 0.3567335680127144, |
| "rewards/accuracy_reward": 0.20833333395421505, |
| "rewards/cosine_scaled_reward": -0.026939572766423225, |
| "step": 191 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3492.1458740234375, |
| "epoch": 0.21942857142857142, |
| "grad_norm": 0.05016130581498146, |
| "kl": 3.1597912311553955e-05, |
| "learning_rate": 1.0554024673218806e-07, |
| "loss": -0.0201, |
| "reward": -0.03553268779069185, |
| "reward_std": 0.31929558888077736, |
| "rewards/accuracy_reward": 0.12500000558793545, |
| "rewards/cosine_scaled_reward": -0.16053267009556293, |
| "step": 192 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2811.1250228881836, |
| "epoch": 0.22057142857142858, |
| "grad_norm": 0.07492359727621078, |
| "kl": 3.6617740988731384e-05, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0277, |
| "reward": 0.020672958344221115, |
| "reward_std": 0.48316178657114506, |
| "rewards/accuracy_reward": 0.14583333395421505, |
| "rewards/cosine_scaled_reward": -0.12516038585454226, |
| "step": 193 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3140.041732788086, |
| "epoch": 0.22171428571428572, |
| "grad_norm": 0.05980326607823372, |
| "kl": 4.427880048751831e-05, |
| "learning_rate": 1.0335423176140511e-07, |
| "loss": -0.0036, |
| "reward": 0.5903252474963665, |
| "reward_std": 0.5727798100560904, |
| "rewards/accuracy_reward": 0.4166666753590107, |
| "rewards/cosine_scaled_reward": 0.17365856003016233, |
| "step": 194 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2799.9791717529297, |
| "epoch": 0.22285714285714286, |
| "grad_norm": 0.059899453073740005, |
| "kl": 4.7072768211364746e-05, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0319, |
| "reward": -0.16166013106703758, |
| "reward_std": 0.2635766211897135, |
| "rewards/accuracy_reward": 0.0416666679084301, |
| "rewards/cosine_scaled_reward": -0.20332680083811283, |
| "step": 195 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3475.625030517578, |
| "epoch": 0.224, |
| "grad_norm": 0.05154493823647499, |
| "kl": 4.3954700231552124e-05, |
| "learning_rate": 1.017123858587145e-07, |
| "loss": 0.0012, |
| "reward": -0.05280750431120396, |
| "reward_std": 0.37235557474195957, |
| "rewards/accuracy_reward": 0.10416666977107525, |
| "rewards/cosine_scaled_reward": -0.1569741666316986, |
| "step": 196 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2883.5416870117188, |
| "epoch": 0.22514285714285714, |
| "grad_norm": 0.08069178462028503, |
| "kl": 4.8279762268066406e-05, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": -0.0765, |
| "reward": 0.35078890819568187, |
| "reward_std": 0.5930481739342213, |
| "rewards/accuracy_reward": 0.2916666716337204, |
| "rewards/cosine_scaled_reward": 0.05912225344218314, |
| "step": 197 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2874.5625076293945, |
| "epoch": 0.22628571428571428, |
| "grad_norm": 0.08328583091497421, |
| "kl": 4.653632640838623e-05, |
| "learning_rate": 1.0061670936044178e-07, |
| "loss": 0.0162, |
| "reward": 0.08681692741811275, |
| "reward_std": 0.3849259242415428, |
| "rewards/accuracy_reward": 0.1666666716337204, |
| "rewards/cosine_scaled_reward": -0.07984975911676884, |
| "step": 198 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 3574.8958435058594, |
| "epoch": 0.22742857142857142, |
| "grad_norm": 0.049314841628074646, |
| "kl": 4.60892915725708e-05, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": 0.0031, |
| "reward": -0.26533746905624866, |
| "reward_std": 0.16454033181071281, |
| "rewards/accuracy_reward": 0.0, |
| "rewards/cosine_scaled_reward": -0.2653374746441841, |
| "step": 199 |
| }, |
| { |
| "clip_fraction": 0.0, |
| "completion_length": 2369.1667098999023, |
| "epoch": 0.22857142857142856, |
| "grad_norm": 0.07246367633342743, |
| "kl": 3.355741500854492e-05, |
| "learning_rate": 1.0006853717962393e-07, |
| "loss": 0.0212, |
| "reward": 0.30690951086580753, |
| "reward_std": 0.45345655642449856, |
| "rewards/accuracy_reward": 0.2500000037252903, |
| "rewards/cosine_scaled_reward": 0.05690951179713011, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.22857142857142856, |
| "step": 200, |
| "total_flos": 0.0, |
| "train_loss": 0.0035571102210087704, |
| "train_runtime": 35198.8637, |
| "train_samples_per_second": 0.273, |
| "train_steps_per_second": 0.006 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 200, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|