| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 2253.854206085205, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.028186120092868805, | |
| "kl": 0.0, | |
| "learning_rate": 0.0, | |
| "loss": 0.0, | |
| "reward": 0.945640604943037, | |
| "reward_std": 0.7231155578047037, | |
| "rewards/cosine_scaled_reward": 0.16032031644135714, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2566.395854949951, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.02438599243760109, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.0, | |
| "reward": 0.7753396667540073, | |
| "reward_std": 0.8076020702719688, | |
| "rewards/cosine_scaled_reward": 0.12725313939154148, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 2850.8333587646484, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.026358311995863914, | |
| "kl": 0.00019162893295288086, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": 0.25994101725518703, | |
| "reward_std": 0.7457061782479286, | |
| "rewards/cosine_scaled_reward": -0.07836283510550857, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 1333.3750228881836, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.03962448611855507, | |
| "kl": 8.179247379302979e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": 1.1095947362482548, | |
| "reward_std": 0.7611480094492435, | |
| "rewards/cosine_scaled_reward": 0.10688067661249079, | |
| "rewards/format_reward": 0.895833333954215, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 3136.312530517578, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.02204746939241886, | |
| "kl": 0.00017178058624267578, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": 0.3529331162571907, | |
| "reward_std": 0.9226996600627899, | |
| "rewards/cosine_scaled_reward": -0.07353345490992069, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 2268.208351135254, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.02553640492260456, | |
| "kl": 0.00011495500802993774, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": 0.47384920963668264, | |
| "reward_std": 0.9380720984190702, | |
| "rewards/cosine_scaled_reward": -0.08599207969382405, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2404.916702270508, | |
| "epoch": 0.008, | |
| "grad_norm": 0.02169734612107277, | |
| "kl": 0.0001201927661895752, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.7995812203735113, | |
| "reward_std": 0.7245029136538506, | |
| "rewards/cosine_scaled_reward": 0.00395727576687932, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 1825.270881652832, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.03927920013666153, | |
| "kl": 6.420910358428955e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 1.2986854314804077, | |
| "reward_std": 0.694359702989459, | |
| "rewards/cosine_scaled_reward": 0.2951760431751609, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 2716.9375, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.031250353902578354, | |
| "kl": 0.00015756487846374512, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.4805310983210802, | |
| "reward_std": 0.6337715331465006, | |
| "rewards/cosine_scaled_reward": 0.021515536587685347, | |
| "rewards/format_reward": 0.4375, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2326.1458625793457, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.02834513410925865, | |
| "kl": 0.00010342895984649658, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.7450611963868141, | |
| "reward_std": 0.7835288420319557, | |
| "rewards/cosine_scaled_reward": 0.09128060017246753, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 3093.3541870117188, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.023482978343963623, | |
| "kl": 0.00014767050743103027, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.3454847726970911, | |
| "reward_std": 0.5903383195400238, | |
| "rewards/cosine_scaled_reward": -0.0043409522622823715, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2152.541748046875, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.028229378163814545, | |
| "kl": 0.0001628100872039795, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": 0.7333892993628979, | |
| "reward_std": 0.9000732153654099, | |
| "rewards/cosine_scaled_reward": 0.0437779575586319, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2562.145866394043, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.022799858823418617, | |
| "kl": 0.00014079362154006958, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.5581501871347427, | |
| "reward_std": 0.8053405769169331, | |
| "rewards/cosine_scaled_reward": 0.018658424261957407, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2327.166679382324, | |
| "epoch": 0.016, | |
| "grad_norm": 0.025850726291537285, | |
| "kl": 0.00013312697410583496, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.4661823809146881, | |
| "reward_std": 0.7131906598806381, | |
| "rewards/cosine_scaled_reward": -0.04815882258117199, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 2491.4791717529297, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.025151003152132034, | |
| "kl": 0.00011346954852342606, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5728016346693039, | |
| "reward_std": 0.4929979182779789, | |
| "rewards/cosine_scaled_reward": 0.036400796845555305, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 3466.8958740234375, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.01853562705218792, | |
| "kl": 0.00017964839935302734, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": 0.04977674409747124, | |
| "reward_std": 0.8271799795329571, | |
| "rewards/cosine_scaled_reward": -0.06886161956936121, | |
| "rewards/format_reward": 0.1875, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 1964.208366394043, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.039788853377103806, | |
| "kl": 0.00011803209781646729, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.9614612013101578, | |
| "reward_std": 0.5819262592121959, | |
| "rewards/cosine_scaled_reward": 0.1786472648382187, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2595.437545776367, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.01998710259795189, | |
| "kl": 0.00012928247451782227, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": 0.7032174468040466, | |
| "reward_std": 0.616347186267376, | |
| "rewards/cosine_scaled_reward": 0.03910870919935405, | |
| "rewards/format_reward": 0.625, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 2346.3542098999023, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.02497478388249874, | |
| "kl": 0.00013540685176849365, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.3847788944840431, | |
| "reward_std": 1.1934307999908924, | |
| "rewards/cosine_scaled_reward": 0.3382227774709463, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 1532.4166984558105, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.030825506895780563, | |
| "kl": 7.693842053413391e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 1.009512271732092, | |
| "reward_std": 0.6844168081879616, | |
| "rewards/cosine_scaled_reward": 0.04642279585823417, | |
| "rewards/format_reward": 0.916666679084301, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2422.31254196167, | |
| "epoch": 0.024, | |
| "grad_norm": 0.032373763620853424, | |
| "kl": 0.0001367330551147461, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.6270023807883263, | |
| "reward_std": 0.6962997727096081, | |
| "rewards/cosine_scaled_reward": 0.053084509272594005, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 1439.500057220459, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.03851747140288353, | |
| "kl": 0.00011110305786132812, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.7128502167761326, | |
| "reward_std": 0.4558130237273872, | |
| "rewards/cosine_scaled_reward": -0.09149156883358955, | |
| "rewards/format_reward": 0.8958333358168602, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 2568.8750534057617, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.027865994721651077, | |
| "kl": 0.0001246929168701172, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.39115861523896456, | |
| "reward_std": 0.9601399153470993, | |
| "rewards/cosine_scaled_reward": -0.05442069785203785, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 1739.6875534057617, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.02797676995396614, | |
| "kl": 7.119029760360718e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.8130059950053692, | |
| "reward_std": 0.8428283482789993, | |
| "rewards/cosine_scaled_reward": 0.02108631655573845, | |
| "rewards/format_reward": 0.770833345130086, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2243.5625381469727, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.02834572270512581, | |
| "kl": 0.00014713406562805176, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.7691881991922855, | |
| "reward_std": 0.6148425601422787, | |
| "rewards/cosine_scaled_reward": 0.10334407165646553, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 2384.479202270508, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.022393781691789627, | |
| "kl": 0.00015068799257278442, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.4031391516327858, | |
| "reward_std": 0.6596824564039707, | |
| "rewards/cosine_scaled_reward": -0.06926377443596721, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 2295.479202270508, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.027790380641818047, | |
| "kl": 0.0001430213451385498, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.7962075062096119, | |
| "reward_std": 0.5945973135530949, | |
| "rewards/cosine_scaled_reward": 0.0439370833337307, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 2273.5000534057617, | |
| "epoch": 0.032, | |
| "grad_norm": 0.03035018779337406, | |
| "kl": 0.00013148784637451172, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.9480000095209107, | |
| "reward_std": 0.969454187899828, | |
| "rewards/cosine_scaled_reward": 0.15108334203250706, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 2891.291748046875, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.024835970252752304, | |
| "kl": 0.00016313791275024414, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.31715210899710655, | |
| "reward_std": 0.7158625386655331, | |
| "rewards/cosine_scaled_reward": -0.060173945501446724, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 2167.875045776367, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.023409895598888397, | |
| "kl": 0.00010243058204650879, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": 1.1814709827303886, | |
| "reward_std": 0.9771759286522865, | |
| "rewards/cosine_scaled_reward": 0.22615211736410856, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2741.916702270508, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.020507322624325752, | |
| "kl": 0.00014102458953857422, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.5180935077369213, | |
| "reward_std": 0.8897004313766956, | |
| "rewards/cosine_scaled_reward": 0.019463416654616594, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 2284.687545776367, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.02322392910718918, | |
| "kl": 0.00012177228927612305, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 1.1874850988388062, | |
| "reward_std": 0.6209478713572025, | |
| "rewards/cosine_scaled_reward": 0.23957587592303753, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 2879.3750610351562, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.019906090572476387, | |
| "kl": 0.00011980533599853516, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.8293444700539112, | |
| "reward_std": 0.8636636873707175, | |
| "rewards/cosine_scaled_reward": 0.11258888803422451, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 1965.4375076293945, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.02797047235071659, | |
| "kl": 0.0001359265297651291, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.257663020864129, | |
| "reward_std": 0.8084600828588009, | |
| "rewards/cosine_scaled_reward": 0.2746648136526346, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 2429.000045776367, | |
| "epoch": 0.04, | |
| "grad_norm": 0.03363242745399475, | |
| "kl": 0.0001286640763282776, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.9006999442353845, | |
| "reward_std": 0.8478060588240623, | |
| "rewards/cosine_scaled_reward": 0.15868327533826232, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 2975.083366394043, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.029229290783405304, | |
| "kl": 0.00017908215522766113, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": 0.09833091939799488, | |
| "reward_std": 0.9010875336825848, | |
| "rewards/cosine_scaled_reward": -0.14875120925717056, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 2954.0833435058594, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.019327132031321526, | |
| "kl": 0.0001398026943206787, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.2552360990084708, | |
| "reward_std": 0.6516154278069735, | |
| "rewards/cosine_scaled_reward": -0.04946526139974594, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3235.125030517578, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.019981542602181435, | |
| "kl": 0.0001823529601097107, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.13104849308729172, | |
| "reward_std": 0.4905168265104294, | |
| "rewards/cosine_scaled_reward": -0.03864241763949394, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2411.5417289733887, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.026408830657601357, | |
| "kl": 0.00011394917964935303, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.5625039599835873, | |
| "reward_std": 0.3937423247843981, | |
| "rewards/cosine_scaled_reward": -0.020831378176808357, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 2098.062545776367, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.02573023922741413, | |
| "kl": 0.00011499226093292236, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.657089076936245, | |
| "reward_std": 0.7896895445883274, | |
| "rewards/cosine_scaled_reward": -0.025622128508985043, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 2488.229217529297, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.02382344752550125, | |
| "kl": 0.00011301040649414062, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.9531193277798593, | |
| "reward_std": 0.625364888459444, | |
| "rewards/cosine_scaled_reward": 0.15364299900829792, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2591.5416984558105, | |
| "epoch": 0.048, | |
| "grad_norm": 0.0474100187420845, | |
| "kl": 0.00018638372421264648, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.019108548760414124, | |
| "reward_std": 0.5053375829011202, | |
| "rewards/cosine_scaled_reward": -0.20919574238359928, | |
| "rewards/format_reward": 0.4375, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 2745.6250610351562, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.02300211787223816, | |
| "kl": 0.00014187395572662354, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.3337301954161376, | |
| "reward_std": 1.0558787789195776, | |
| "rewards/cosine_scaled_reward": -0.04146823566406965, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2167.33341217041, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.038022760301828384, | |
| "kl": 0.00011192262172698975, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 1.0215463279746473, | |
| "reward_std": 0.37458088528364897, | |
| "rewards/cosine_scaled_reward": 0.14618983678519726, | |
| "rewards/format_reward": 0.7291666697710752, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 3106.7083740234375, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.02269629016518593, | |
| "kl": 0.00013549625873565674, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.7414790373295546, | |
| "reward_std": 0.6533729061484337, | |
| "rewards/cosine_scaled_reward": 0.11032285634428263, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 2975.0208587646484, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.023037103936076164, | |
| "kl": 0.0001526474952697754, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": 0.13235431909561157, | |
| "reward_std": 0.6402451954782009, | |
| "rewards/cosine_scaled_reward": -0.15257284580729902, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2202.0417098999023, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.03050912357866764, | |
| "kl": 9.85860824584961e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": 1.0235360227525234, | |
| "reward_std": 0.6361051239073277, | |
| "rewards/cosine_scaled_reward": 0.15760133787989616, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2482.937530517578, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.028333164751529694, | |
| "kl": 0.00011826306581497192, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.8281691037118435, | |
| "reward_std": 0.8235839083790779, | |
| "rewards/cosine_scaled_reward": 0.14325121929869056, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 1686.7708778381348, | |
| "epoch": 0.056, | |
| "grad_norm": 0.03345295041799545, | |
| "kl": 9.24915075302124e-05, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 1.1297309827059507, | |
| "reward_std": 0.8849144196137786, | |
| "rewards/cosine_scaled_reward": 0.12736546620726585, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2702.166679382324, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.025781678035855293, | |
| "kl": 0.00011380016803741455, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.774685425683856, | |
| "reward_std": 0.6316370200365782, | |
| "rewards/cosine_scaled_reward": 0.12692604400217533, | |
| "rewards/format_reward": 0.5208333376795053, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 2162.75, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.032990217208862305, | |
| "kl": 0.0001423358917236328, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.395895104855299, | |
| "reward_std": 0.5788541845977306, | |
| "rewards/cosine_scaled_reward": -0.062469134107232094, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 2481.3334045410156, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.027054639533162117, | |
| "kl": 0.00010142475366592407, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 1.1616107635200024, | |
| "reward_std": 0.6760222055017948, | |
| "rewards/cosine_scaled_reward": 0.23705538734793663, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2468.5208892822266, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.026934338733553886, | |
| "kl": 0.000138893723487854, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": 1.0014648959040642, | |
| "reward_std": 1.0805351361632347, | |
| "rewards/cosine_scaled_reward": 0.15698243118822575, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 1780.3333435058594, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.030859723687171936, | |
| "kl": 9.08970832824707e-05, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": 1.5217821262776852, | |
| "reward_std": 0.8481816910207272, | |
| "rewards/cosine_scaled_reward": 0.35464105010032654, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 2687.4166946411133, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.021781187504529953, | |
| "kl": 0.00013262033462524414, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 1.0466104643419385, | |
| "reward_std": 0.878080858848989, | |
| "rewards/cosine_scaled_reward": 0.20038855448365211, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 2628.770866394043, | |
| "epoch": 0.064, | |
| "grad_norm": 0.02273709513247013, | |
| "kl": 0.0001190677285194397, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.7287426479160786, | |
| "reward_std": 0.6065099984407425, | |
| "rewards/cosine_scaled_reward": 0.06228797510266304, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 2883.854202270508, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.017316030338406563, | |
| "kl": 0.00010225176811218262, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.4611471053212881, | |
| "reward_std": 1.0018693208694458, | |
| "rewards/cosine_scaled_reward": -0.019426452228799462, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 1413.4791793823242, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.03232548013329506, | |
| "kl": 6.647780537605286e-05, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": 0.8900517076253891, | |
| "reward_std": 0.7214644104242325, | |
| "rewards/cosine_scaled_reward": 0.049192506819963455, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2609.520851135254, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.02341977320611477, | |
| "kl": 0.00011394917964935303, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": 0.6282028257846832, | |
| "reward_std": 0.7330185724422336, | |
| "rewards/cosine_scaled_reward": 0.08493474265560508, | |
| "rewards/format_reward": 0.45833333395421505, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2260.875030517578, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.025622840970754623, | |
| "kl": 0.00010037794709205627, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.7927640732377768, | |
| "reward_std": 0.5676146075129509, | |
| "rewards/cosine_scaled_reward": 0.04221535753458738, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2364.1042098999023, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.023006901144981384, | |
| "kl": 8.575618267059326e-05, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.5264723375439644, | |
| "reward_std": 0.4836166054010391, | |
| "rewards/cosine_scaled_reward": -0.0388471744954586, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 2127.2500228881836, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.024150002747774124, | |
| "kl": 8.403509855270386e-05, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 1.038158729672432, | |
| "reward_std": 0.8189139366149902, | |
| "rewards/cosine_scaled_reward": 0.14407933596521616, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 1480.7500228881836, | |
| "epoch": 0.072, | |
| "grad_norm": 0.03183520957827568, | |
| "kl": 7.95125961303711e-05, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 1.3979634009301662, | |
| "reward_std": 0.5990209653973579, | |
| "rewards/cosine_scaled_reward": 0.261481698602438, | |
| "rewards/format_reward": 0.875, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2504.666679382324, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.021520959213376045, | |
| "kl": 0.0001504272222518921, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0, | |
| "reward": 0.8372010216116905, | |
| "reward_std": 0.6612134985625744, | |
| "rewards/cosine_scaled_reward": 0.1373505061492324, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2580.6458702087402, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.028981182724237442, | |
| "kl": 0.00011400878429412842, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.45997084584087133, | |
| "reward_std": 0.7114468142390251, | |
| "rewards/cosine_scaled_reward": -0.030431261286139488, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 2067.458335876465, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.03032701462507248, | |
| "kl": 9.172409772872925e-05, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.301434013992548, | |
| "reward_std": 0.645634401589632, | |
| "rewards/cosine_scaled_reward": -0.09928299766033888, | |
| "rewards/format_reward": 0.5, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3204.3541717529297, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.016771739348769188, | |
| "kl": 0.00011616945266723633, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.028863554820418358, | |
| "reward_std": 0.6313999556005001, | |
| "rewards/cosine_scaled_reward": -0.14181822165846825, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 1526.895866394043, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.03267313167452812, | |
| "kl": 8.743256330490112e-05, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": 0.8832469508051872, | |
| "reward_std": 0.6210112273693085, | |
| "rewards/cosine_scaled_reward": 0.06662345677614212, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 1829.0416946411133, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.031190019100904465, | |
| "kl": 9.609758853912354e-05, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0, | |
| "reward": 0.6412256211042404, | |
| "reward_std": 0.839095912873745, | |
| "rewards/cosine_scaled_reward": -0.03355386573821306, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 2334.7083740234375, | |
| "epoch": 0.08, | |
| "grad_norm": 0.023025985807180405, | |
| "kl": 9.860843420028687e-05, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": 0.6220458149909973, | |
| "reward_std": 0.6576723717153072, | |
| "rewards/cosine_scaled_reward": -0.011893758550286293, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2430.8750228881836, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.03443196043372154, | |
| "kl": 0.00010547041893005371, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.6613975018262863, | |
| "reward_std": 0.5863231625407934, | |
| "rewards/cosine_scaled_reward": 0.11194872949272394, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2218.250045776367, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.02904508076608181, | |
| "kl": 9.149312973022461e-05, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0, | |
| "reward": 0.39750963170081377, | |
| "reward_std": 0.5485289674252272, | |
| "rewards/cosine_scaled_reward": -0.15541186556220055, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 3065.2708892822266, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.01883835718035698, | |
| "kl": 0.0001595020294189453, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0, | |
| "reward": 0.3231031158939004, | |
| "reward_std": 0.7491481080651283, | |
| "rewards/cosine_scaled_reward": -0.03636510670185089, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2371.104202270508, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.024749629199504852, | |
| "kl": 0.00012037158012390137, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0, | |
| "reward": 0.6487875208258629, | |
| "reward_std": 0.839644305408001, | |
| "rewards/cosine_scaled_reward": 0.05356042645871639, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2677.5, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.02110193856060505, | |
| "kl": 0.0001303553581237793, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0, | |
| "reward": 0.6778632658533752, | |
| "reward_std": 0.7005478721112013, | |
| "rewards/cosine_scaled_reward": 0.07851496431976557, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2419.2291984558105, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.02441054955124855, | |
| "kl": 0.0001291334629058838, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0, | |
| "reward": 0.41384580731391907, | |
| "reward_std": 0.6501213163137436, | |
| "rewards/cosine_scaled_reward": -0.053493766114115715, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 2707.1042098999023, | |
| "epoch": 0.088, | |
| "grad_norm": 0.020344527438282967, | |
| "kl": 0.00013850629329681396, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": 0.2681877203285694, | |
| "reward_std": 0.5029121972620487, | |
| "rewards/cosine_scaled_reward": -0.11590614821761847, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2731.479232788086, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.01844891719520092, | |
| "kl": 0.00013375282287597656, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": 0.7639368114178069, | |
| "reward_std": 0.8214902877807617, | |
| "rewards/cosine_scaled_reward": 0.12155175860971212, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2004.6041946411133, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.028990577906370163, | |
| "kl": 8.181855082511902e-05, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.7587422616779804, | |
| "reward_std": 0.9989555478096008, | |
| "rewards/cosine_scaled_reward": 0.04603780619800091, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 2782.375045776367, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.023466741666197777, | |
| "kl": 0.00015717744827270508, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": 0.6499133557081223, | |
| "reward_std": 0.7965570129454136, | |
| "rewards/cosine_scaled_reward": 0.05412333086133003, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 2665.3542137145996, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.028318636119365692, | |
| "kl": 0.0001677870750427246, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": 0.4032784029841423, | |
| "reward_std": 0.6976380785927176, | |
| "rewards/cosine_scaled_reward": -0.03794414922595024, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 2347.4583702087402, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.025593508034944534, | |
| "kl": 0.00010985136032104492, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0, | |
| "reward": 0.8746878691017628, | |
| "reward_std": 0.9405775591731071, | |
| "rewards/cosine_scaled_reward": 0.14567727083340287, | |
| "rewards/format_reward": 0.5833333376795053, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 2405.333366394043, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.03943740949034691, | |
| "kl": 0.00012260675430297852, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.6102542355656624, | |
| "reward_std": 0.8917163014411926, | |
| "rewards/cosine_scaled_reward": 0.03429375775158405, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2356.6666984558105, | |
| "epoch": 0.096, | |
| "grad_norm": 0.023660918697714806, | |
| "kl": 0.00010949373245239258, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": 0.49356625229120255, | |
| "reward_std": 0.6785021275281906, | |
| "rewards/cosine_scaled_reward": -0.03446688875555992, | |
| "rewards/format_reward": 0.5625, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 2653.9791870117188, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.01803443394601345, | |
| "kl": 9.615719318389893e-05, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.8409973373636603, | |
| "reward_std": 0.8465002179145813, | |
| "rewards/cosine_scaled_reward": 0.10799866930756252, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2493.8542098999023, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.026323335245251656, | |
| "kl": 0.00012114644050598145, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.48225442320108414, | |
| "reward_std": 0.4767768904566765, | |
| "rewards/cosine_scaled_reward": -0.00887279398739338, | |
| "rewards/format_reward": 0.5, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2238.4583587646484, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.02708311937749386, | |
| "kl": 0.00014419853687286377, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.8275710507296026, | |
| "reward_std": 0.5411144681274891, | |
| "rewards/cosine_scaled_reward": 0.09086884278804064, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 1580.8958854675293, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.029665708541870117, | |
| "kl": 8.863955736160278e-05, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0, | |
| "reward": 0.9847916029393673, | |
| "reward_std": 0.9228500425815582, | |
| "rewards/cosine_scaled_reward": 0.08614580077119172, | |
| "rewards/format_reward": 0.812500013038516, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 2665.1666870117188, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.019765589386224747, | |
| "kl": 0.00011581182479858398, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0, | |
| "reward": 0.6976313246414065, | |
| "reward_std": 0.7233806774020195, | |
| "rewards/cosine_scaled_reward": 0.09881562972441316, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2352.7291870117188, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.04337649419903755, | |
| "kl": 0.00018215179443359375, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0, | |
| "reward": 0.2358827404677868, | |
| "reward_std": 0.5784358195960522, | |
| "rewards/cosine_scaled_reward": -0.15289196744561195, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 2579.8750534057617, | |
| "epoch": 0.104, | |
| "grad_norm": 0.024008560925722122, | |
| "kl": 0.00012639164924621582, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "reward": 0.5804571909829974, | |
| "reward_std": 0.7776114530861378, | |
| "rewards/cosine_scaled_reward": 0.029811910664648167, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2127.750015258789, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.03029821440577507, | |
| "kl": 9.115040302276611e-05, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0, | |
| "reward": 0.6724662911146879, | |
| "reward_std": 0.40310859866440296, | |
| "rewards/cosine_scaled_reward": -0.059600187465548515, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 3542.4791870117188, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.01921100728213787, | |
| "kl": 0.00022071599960327148, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0, | |
| "reward": -0.3310100920498371, | |
| "reward_std": 0.44439256377518177, | |
| "rewards/cosine_scaled_reward": -0.19675504975020885, | |
| "rewards/format_reward": 0.06250000186264515, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 2373.1667251586914, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.02489403821527958, | |
| "kl": 0.0001574680209159851, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0, | |
| "reward": 0.7052749944850802, | |
| "reward_std": 0.8667173758149147, | |
| "rewards/cosine_scaled_reward": 0.05055414792150259, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3027.5208740234375, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.02021307684481144, | |
| "kl": 0.0001308917999267578, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0, | |
| "reward": 0.38812466710805893, | |
| "reward_std": 0.7133069280534983, | |
| "rewards/cosine_scaled_reward": -0.03510434227064252, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2473.1875762939453, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.030283445492386818, | |
| "kl": 9.927153587341309e-05, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": 0.6445761788636446, | |
| "reward_std": 0.8469636254012585, | |
| "rewards/cosine_scaled_reward": 0.04103806708008051, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 2826.166732788086, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.022094760090112686, | |
| "kl": 0.00012320280075073242, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0, | |
| "reward": 0.33709235209971666, | |
| "reward_std": 0.8025432769209146, | |
| "rewards/cosine_scaled_reward": -0.03978714719414711, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2312.9166946411133, | |
| "epoch": 0.112, | |
| "grad_norm": 0.032762184739112854, | |
| "kl": 9.972602128982544e-05, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.34186697006225586, | |
| "reward_std": 0.7580292820930481, | |
| "rewards/cosine_scaled_reward": -0.06864983681589365, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2789.12504196167, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.0221230611205101, | |
| "kl": 0.00011648237705230713, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0, | |
| "reward": 0.3996657454408705, | |
| "reward_std": 0.7801091782748699, | |
| "rewards/cosine_scaled_reward": -0.02933379588648677, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 2257.708381652832, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.023916251957416534, | |
| "kl": 0.00011393427848815918, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0, | |
| "reward": 1.030206087976694, | |
| "reward_std": 0.9744261428713799, | |
| "rewards/cosine_scaled_reward": 0.1921863555908203, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2282.979202270508, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.029999660328030586, | |
| "kl": 0.0001456141471862793, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "reward": 0.869966953061521, | |
| "reward_std": 0.784905806183815, | |
| "rewards/cosine_scaled_reward": 0.11206682212650776, | |
| "rewards/format_reward": 0.6458333488553762, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2075.9792098999023, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.030493436381220818, | |
| "kl": 0.00013086199760437012, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0, | |
| "reward": 0.8771389909088612, | |
| "reward_std": 0.7964347116649151, | |
| "rewards/cosine_scaled_reward": 0.07398613449186087, | |
| "rewards/format_reward": 0.7291666753590107, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2310.7500228881836, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.029018821194767952, | |
| "kl": 0.00010515749454498291, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0, | |
| "reward": 0.5800389759242535, | |
| "reward_std": 0.5602545086294413, | |
| "rewards/cosine_scaled_reward": -0.06414720602333546, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2372.104263305664, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.027296504005789757, | |
| "kl": 0.0001329481601715088, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0, | |
| "reward": 0.5882483441382647, | |
| "reward_std": 0.730236142873764, | |
| "rewards/cosine_scaled_reward": -0.028792519122362137, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2313.625030517578, | |
| "epoch": 0.12, | |
| "grad_norm": 0.025855960324406624, | |
| "kl": 0.00012581050395965576, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0, | |
| "reward": 1.154417909681797, | |
| "reward_std": 0.6686892919242382, | |
| "rewards/cosine_scaled_reward": 0.25429226853884757, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 1879.3125457763672, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.030279330909252167, | |
| "kl": 7.063150405883789e-05, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0, | |
| "reward": 1.0762072280049324, | |
| "reward_std": 0.7403409704566002, | |
| "rewards/cosine_scaled_reward": 0.1735202744603157, | |
| "rewards/format_reward": 0.7291666697710752, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2561.50004196167, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.023640897125005722, | |
| "kl": 0.00015266239643096924, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0, | |
| "reward": 0.47847325541079044, | |
| "reward_std": 0.5093330107629299, | |
| "rewards/cosine_scaled_reward": -0.02118003647774458, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2796.229179382324, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.02216712199151516, | |
| "kl": 0.00013267993927001953, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.4137321300804615, | |
| "reward_std": 0.7223830446600914, | |
| "rewards/cosine_scaled_reward": -0.0014672763645648956, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2795.687545776367, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.022554874420166016, | |
| "kl": 0.0001379549503326416, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0, | |
| "reward": 0.27347924932837486, | |
| "reward_std": 0.6205065222457051, | |
| "rewards/cosine_scaled_reward": -0.050760375801473856, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2653.0417442321777, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.02770637348294258, | |
| "kl": 0.00011685490608215332, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 1.0205977819859982, | |
| "reward_std": 0.9722634367644787, | |
| "rewards/cosine_scaled_reward": 0.17696558311581612, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 2789.4583587646484, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.0231799129396677, | |
| "kl": 0.00014919042587280273, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0, | |
| "reward": 0.5839154049754143, | |
| "reward_std": 0.6967787519097328, | |
| "rewards/cosine_scaled_reward": 0.07320770528167486, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3039.875030517578, | |
| "epoch": 0.128, | |
| "grad_norm": 0.021811209619045258, | |
| "kl": 0.00016495585441589355, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0, | |
| "reward": 0.43579965829849243, | |
| "reward_std": 0.8692431841045618, | |
| "rewards/cosine_scaled_reward": -0.0008501838892698288, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2002.0000305175781, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.03178806230425835, | |
| "kl": 0.00013971328735351562, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0, | |
| "reward": 0.7515672761946917, | |
| "reward_std": 0.8758837655186653, | |
| "rewards/cosine_scaled_reward": 0.0007836292497813702, | |
| "rewards/format_reward": 0.7500000093132257, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2049.791717529297, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.030137214809656143, | |
| "kl": 8.422881364822388e-05, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0, | |
| "reward": 0.46851017512381077, | |
| "reward_std": 0.3973765755072236, | |
| "rewards/cosine_scaled_reward": -0.11991160549223423, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2772.979217529297, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.031248675659298897, | |
| "kl": 0.00012229382991790771, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0, | |
| "reward": 0.7633647555485368, | |
| "reward_std": 0.8922017849981785, | |
| "rewards/cosine_scaled_reward": 0.14209904265590012, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3117.1875534057617, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.025444043800234795, | |
| "kl": 0.00015610456466674805, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0, | |
| "reward": 0.1580726346001029, | |
| "reward_std": 0.614492192864418, | |
| "rewards/cosine_scaled_reward": -0.08763036131858826, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 2826.8958892822266, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.02326161414384842, | |
| "kl": 0.00017911195755004883, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0, | |
| "reward": 0.10269369930028915, | |
| "reward_std": 0.7328709326684475, | |
| "rewards/cosine_scaled_reward": -0.14656982268206775, | |
| "rewards/format_reward": 0.3958333469927311, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2824.666717529297, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.020892256870865822, | |
| "kl": 0.00012445449829101562, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0, | |
| "reward": 1.3870177865028381, | |
| "reward_std": 1.0715553350746632, | |
| "rewards/cosine_scaled_reward": 0.3601755518466234, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 1789.0000267028809, | |
| "epoch": 0.136, | |
| "grad_norm": 0.042789019644260406, | |
| "kl": 9.068846702575684e-05, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0, | |
| "reward": 0.9923834577202797, | |
| "reward_std": 0.4148991871625185, | |
| "rewards/cosine_scaled_reward": 0.1420250441879034, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2159.7292098999023, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.030027495697140694, | |
| "kl": 0.00016351789236068726, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0, | |
| "reward": 0.6492169927805662, | |
| "reward_std": 0.827868428081274, | |
| "rewards/cosine_scaled_reward": 0.032941827550530434, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 1691.541690826416, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.03286458179354668, | |
| "kl": 0.00010415911674499512, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0, | |
| "reward": 0.7581329848617315, | |
| "reward_std": 0.4227004833519459, | |
| "rewards/cosine_scaled_reward": 0.03531647473573685, | |
| "rewards/format_reward": 0.6875000018626451, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2760.31254196167, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.024439577013254166, | |
| "kl": 0.0001647472381591797, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0, | |
| "reward": 0.755975735373795, | |
| "reward_std": 0.785768523812294, | |
| "rewards/cosine_scaled_reward": 0.08632119931280613, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2559.270896911621, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.021681685000658035, | |
| "kl": 0.00013074278831481934, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0, | |
| "reward": 0.5207496341317892, | |
| "reward_std": 0.6123828925192356, | |
| "rewards/cosine_scaled_reward": -0.010458520613610744, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 1917.479175567627, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.028799263760447502, | |
| "kl": 7.950514554977417e-05, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0, | |
| "reward": 1.3661205926910043, | |
| "reward_std": 0.6406003027223051, | |
| "rewards/cosine_scaled_reward": 0.3288935967721045, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 2284.5833435058594, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.026874002069234848, | |
| "kl": 9.514018893241882e-05, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0, | |
| "reward": 0.4183652549982071, | |
| "reward_std": 0.5504244212061167, | |
| "rewards/cosine_scaled_reward": -0.019984053447842598, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2366.625045776367, | |
| "epoch": 0.144, | |
| "grad_norm": 0.025750989094376564, | |
| "kl": 0.0001022443175315857, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0, | |
| "reward": 0.8881729152053595, | |
| "reward_std": 0.8806649167090654, | |
| "rewards/cosine_scaled_reward": 0.14200311340391636, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 3196.8750610351562, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.01978475973010063, | |
| "kl": 0.000141829252243042, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0, | |
| "reward": -0.040465060621500015, | |
| "reward_std": 0.6252349764108658, | |
| "rewards/cosine_scaled_reward": -0.17648251354694366, | |
| "rewards/format_reward": 0.31250000931322575, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2271.8958740234375, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.029708651825785637, | |
| "kl": 0.00015082955360412598, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0, | |
| "reward": 0.825566089246422, | |
| "reward_std": 0.7982673496007919, | |
| "rewards/cosine_scaled_reward": 0.11069970577955246, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 3237.4583435058594, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.019838884472846985, | |
| "kl": 0.0001277327537536621, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0, | |
| "reward": 0.4639533148147166, | |
| "reward_std": 0.9510225504636765, | |
| "rewards/cosine_scaled_reward": 0.03405998833477497, | |
| "rewards/format_reward": 0.39583334140479565, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3055.3333587646484, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.020251061767339706, | |
| "kl": 0.0001876354217529297, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0, | |
| "reward": 0.29083502665162086, | |
| "reward_std": 0.9767868965864182, | |
| "rewards/cosine_scaled_reward": -0.00041582807898521423, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2579.000068664551, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.023820001631975174, | |
| "kl": 0.0001468360424041748, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0, | |
| "reward": 0.7045300118625164, | |
| "reward_std": 0.47794024273753166, | |
| "rewards/cosine_scaled_reward": 0.0814316663891077, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2430.1250228881836, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.024699199944734573, | |
| "kl": 0.00010822713375091553, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0, | |
| "reward": 0.9790843389928341, | |
| "reward_std": 0.9423319976776838, | |
| "rewards/cosine_scaled_reward": 0.22912549204193056, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3047.2083587646484, | |
| "epoch": 0.152, | |
| "grad_norm": 0.020359130576252937, | |
| "kl": 0.00017851591110229492, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0, | |
| "reward": 0.1808095509186387, | |
| "reward_std": 0.9317280426621437, | |
| "rewards/cosine_scaled_reward": -0.08667857074760832, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2291.5833892822266, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.02574433758854866, | |
| "kl": 0.00011102622374892235, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0, | |
| "reward": 0.8931092359125614, | |
| "reward_std": 0.7652852088212967, | |
| "rewards/cosine_scaled_reward": 0.14447128726169467, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 1257.270839691162, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.037061907351017, | |
| "kl": 6.86943531036377e-05, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0, | |
| "reward": 1.4347281754016876, | |
| "reward_std": 0.5897002862766385, | |
| "rewards/cosine_scaled_reward": 0.2798640683759004, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2301.437515258789, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.029598237946629524, | |
| "kl": 0.00010699033737182617, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0, | |
| "reward": 1.0288100242614746, | |
| "reward_std": 0.6650557238608599, | |
| "rewards/cosine_scaled_reward": 0.2227383404970169, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2888.0833740234375, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.021254906430840492, | |
| "kl": 0.00012347102165222168, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0, | |
| "reward": 0.15984412096440792, | |
| "reward_std": 0.8047252930700779, | |
| "rewards/cosine_scaled_reward": -0.12841127801220864, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2664.2500610351562, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.04367408901453018, | |
| "kl": 0.00011092424392700195, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0, | |
| "reward": 0.13648264110088348, | |
| "reward_std": 0.6316529109608382, | |
| "rewards/cosine_scaled_reward": -0.12967534782364964, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2937.541717529297, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.02476118691265583, | |
| "kl": 0.00015249848365783691, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0, | |
| "reward": 0.49358947202563286, | |
| "reward_std": 0.7895509861409664, | |
| "rewards/cosine_scaled_reward": -0.003205273300409317, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2945.854202270508, | |
| "epoch": 0.16, | |
| "grad_norm": 0.021247655153274536, | |
| "kl": 0.00016349554061889648, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0, | |
| "reward": 0.7320265481248498, | |
| "reward_std": 1.0867303423583508, | |
| "rewards/cosine_scaled_reward": 0.12642992846667767, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2825.7292251586914, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.02051922306418419, | |
| "kl": 0.0001229420304298401, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0, | |
| "reward": 0.4059164673089981, | |
| "reward_std": 0.9311309345066547, | |
| "rewards/cosine_scaled_reward": -0.047041770070791245, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2637.9583892822266, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.021125253289937973, | |
| "kl": 0.0001480579376220703, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0, | |
| "reward": 0.559799462556839, | |
| "reward_std": 0.6969177909195423, | |
| "rewards/cosine_scaled_reward": -0.0221836119890213, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2216.8750762939453, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.028433097526431084, | |
| "kl": 0.00013668090105056763, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0, | |
| "reward": 0.6880653910338879, | |
| "reward_std": 0.7187580987811089, | |
| "rewards/cosine_scaled_reward": -0.020550659857690334, | |
| "rewards/format_reward": 0.7291666697710752, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2552.895881652832, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.026991255581378937, | |
| "kl": 0.00011313706636428833, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0, | |
| "reward": 0.8902260856702924, | |
| "reward_std": 1.0206835009157658, | |
| "rewards/cosine_scaled_reward": 0.16386302933096886, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 1938.041732788086, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.032270271331071854, | |
| "kl": 0.00011307001113891602, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0, | |
| "reward": 0.7428423997480422, | |
| "reward_std": 0.5232174023985863, | |
| "rewards/cosine_scaled_reward": 0.05892119184136391, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2219.791702270508, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.02581549808382988, | |
| "kl": 0.00010230764746665955, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0, | |
| "reward": 0.07503729220479727, | |
| "reward_std": 0.5477688852697611, | |
| "rewards/cosine_scaled_reward": -0.2541480294894427, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 3379.812530517578, | |
| "epoch": 0.168, | |
| "grad_norm": 0.01836305670440197, | |
| "kl": 0.00017067790031433105, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0, | |
| "reward": 0.33875247836112976, | |
| "reward_std": 0.7588643953204155, | |
| "rewards/cosine_scaled_reward": 0.013126220554113388, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 2358.4166870117188, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.02416214346885681, | |
| "kl": 0.0001198500394821167, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0, | |
| "reward": 0.6435984447598457, | |
| "reward_std": 0.7261539697647095, | |
| "rewards/cosine_scaled_reward": 0.009299194440245628, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2277.3750534057617, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.021401530131697655, | |
| "kl": 8.6270272731781e-05, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0, | |
| "reward": 1.1666594594717026, | |
| "reward_std": 0.8260795883834362, | |
| "rewards/cosine_scaled_reward": 0.18749637342989445, | |
| "rewards/format_reward": 0.7916666753590107, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2718.6667098999023, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.026421265676617622, | |
| "kl": 0.0001287609338760376, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0, | |
| "reward": 0.5824301056563854, | |
| "reward_std": 0.8789757378399372, | |
| "rewards/cosine_scaled_reward": 0.07246502116322517, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2435.0834045410156, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.022044222801923752, | |
| "kl": 0.00015279650688171387, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0, | |
| "reward": 1.020676076412201, | |
| "reward_std": 0.8065766766667366, | |
| "rewards/cosine_scaled_reward": 0.1770047047175467, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2837.125030517578, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.03187905251979828, | |
| "kl": 0.00020840764045715332, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0, | |
| "reward": -0.09739869087934494, | |
| "reward_std": 0.588227073661983, | |
| "rewards/cosine_scaled_reward": -0.21536601521074772, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2711.1666984558105, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.02911432646214962, | |
| "kl": 0.00015884637832641602, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0, | |
| "reward": 0.37669877288863063, | |
| "reward_std": 0.8355071609839797, | |
| "rewards/cosine_scaled_reward": -0.051233954494819045, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2756.3750228881836, | |
| "epoch": 0.176, | |
| "grad_norm": 0.02044747956097126, | |
| "kl": 0.00012248754501342773, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0, | |
| "reward": 0.9225640147924423, | |
| "reward_std": 0.8925285078585148, | |
| "rewards/cosine_scaled_reward": 0.18003201112151146, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 2223.312526702881, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.03391764312982559, | |
| "kl": 0.00013962388038635254, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0, | |
| "reward": 0.7340945638716221, | |
| "reward_std": 0.9665955938398838, | |
| "rewards/cosine_scaled_reward": 0.0649639368057251, | |
| "rewards/format_reward": 0.6041666753590107, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2733.0417098999023, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.024309324100613594, | |
| "kl": 0.00012956559658050537, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0, | |
| "reward": 0.21320721879601479, | |
| "reward_std": 0.7121351584792137, | |
| "rewards/cosine_scaled_reward": -0.08089640364050865, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2663.6041870117188, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.023216916248202324, | |
| "kl": 0.00014782696962356567, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0, | |
| "reward": 0.14814166724681854, | |
| "reward_std": 0.3913048207759857, | |
| "rewards/cosine_scaled_reward": -0.16551249055191875, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2328.541702270508, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.03534715995192528, | |
| "kl": 0.00013265013694763184, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0, | |
| "reward": 0.9912874735891819, | |
| "reward_std": 0.8466089218854904, | |
| "rewards/cosine_scaled_reward": 0.1727270409464836, | |
| "rewards/format_reward": 0.645833345130086, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 3219.0833587646484, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.02012258768081665, | |
| "kl": 0.00016486644744873047, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0, | |
| "reward": 0.21142571535892785, | |
| "reward_std": 0.7264946773648262, | |
| "rewards/cosine_scaled_reward": -0.019287142204120755, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2777.000045776367, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.025370828807353973, | |
| "kl": 0.0001633763313293457, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0, | |
| "reward": 0.5651849992573261, | |
| "reward_std": 0.7188206501305103, | |
| "rewards/cosine_scaled_reward": 0.06384249404072762, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2474.541732788086, | |
| "epoch": 0.184, | |
| "grad_norm": 0.030321095138788223, | |
| "kl": 0.00015471689403057098, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0, | |
| "reward": 0.7223290950059891, | |
| "reward_std": 0.8424487709999084, | |
| "rewards/cosine_scaled_reward": 0.05908120982348919, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3007.937530517578, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.023185160011053085, | |
| "kl": 0.00018095970153808594, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0, | |
| "reward": 0.43497929722070694, | |
| "reward_std": 0.8792226985096931, | |
| "rewards/cosine_scaled_reward": 0.009156312793493271, | |
| "rewards/format_reward": 0.4166666753590107, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2370.145866394043, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.025774484500288963, | |
| "kl": 0.00015078485012054443, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0, | |
| "reward": 0.9850538782775402, | |
| "reward_std": 0.45399810932576656, | |
| "rewards/cosine_scaled_reward": 0.1904435846954584, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2291.64591217041, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.032293785363435745, | |
| "kl": 0.00010757520794868469, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0, | |
| "reward": 0.8117690533399582, | |
| "reward_std": 0.7221045140177011, | |
| "rewards/cosine_scaled_reward": 0.11421786062419415, | |
| "rewards/format_reward": 0.5833333376795053, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2779.3125534057617, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.02514166198670864, | |
| "kl": 0.0001589655876159668, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0, | |
| "reward": 0.16848786175251007, | |
| "reward_std": 0.7497027926146984, | |
| "rewards/cosine_scaled_reward": -0.11367274331860244, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2580.3542251586914, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.01945258490741253, | |
| "kl": 0.00012123212218284607, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0, | |
| "reward": 0.7501739612780511, | |
| "reward_std": 1.0527099072933197, | |
| "rewards/cosine_scaled_reward": 0.10425363201647997, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2261.9166946411133, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.024124927818775177, | |
| "kl": 0.00012174248695373535, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0, | |
| "reward": 0.5743089579045773, | |
| "reward_std": 0.7672125902026892, | |
| "rewards/cosine_scaled_reward": -0.014928878052160144, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2618.8958587646484, | |
| "epoch": 0.192, | |
| "grad_norm": 0.02464590221643448, | |
| "kl": 0.00015217065811157227, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0, | |
| "reward": 0.6550418548285961, | |
| "reward_std": 0.8852942362427711, | |
| "rewards/cosine_scaled_reward": 0.056687585078179836, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 1920.0208740234375, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.02443297766149044, | |
| "kl": 0.00012825848534703255, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0, | |
| "reward": 1.19900780916214, | |
| "reward_std": 0.48580580204725266, | |
| "rewards/cosine_scaled_reward": 0.2870038769906387, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2522.541702270508, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.028783444315195084, | |
| "kl": 0.00019599497318267822, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0, | |
| "reward": 0.48285481217317283, | |
| "reward_std": 0.5786700565367937, | |
| "rewards/cosine_scaled_reward": 0.0018440568819642067, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2279.4166870117188, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.028025727719068527, | |
| "kl": 0.0001086406409740448, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0, | |
| "reward": 0.6512754829600453, | |
| "reward_std": 0.7119449377059937, | |
| "rewards/cosine_scaled_reward": 0.06522107869386673, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2717.416732788086, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.0323350615799427, | |
| "kl": 0.00017327070236206055, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0, | |
| "reward": 0.43108027055859566, | |
| "reward_std": 0.6473873369395733, | |
| "rewards/cosine_scaled_reward": 0.007206789217889309, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 1855.1458549499512, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.05176561325788498, | |
| "kl": 0.00010795891284942627, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0, | |
| "reward": 0.4854015044402331, | |
| "reward_std": 0.7094262056052685, | |
| "rewards/cosine_scaled_reward": -0.059382592036854476, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 1968.1458587646484, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.026019148528575897, | |
| "kl": 0.0001271367073059082, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0, | |
| "reward": 0.9800305366516113, | |
| "reward_std": 0.8449381552636623, | |
| "rewards/cosine_scaled_reward": 0.09418194030877203, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2335.395835876465, | |
| "epoch": 0.2, | |
| "grad_norm": 0.02863323502242565, | |
| "kl": 0.00011295080184936523, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0, | |
| "reward": 0.6143988557159901, | |
| "reward_std": 0.4556568693369627, | |
| "rewards/cosine_scaled_reward": 0.05719940923154354, | |
| "rewards/format_reward": 0.5, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2449.2708778381348, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.02869235724210739, | |
| "kl": 0.00014215707778930664, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0, | |
| "reward": 1.1555413324385881, | |
| "reward_std": 1.0306697580963373, | |
| "rewards/cosine_scaled_reward": 0.24443731893552467, | |
| "rewards/format_reward": 0.6666666753590107, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2694.562515258789, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.023254889994859695, | |
| "kl": 0.00017695128917694092, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0, | |
| "reward": 0.26045722933486104, | |
| "reward_std": 0.7560887522995472, | |
| "rewards/cosine_scaled_reward": -0.11977138603106141, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2326.645896911621, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.05164008587598801, | |
| "kl": 0.0001480579376220703, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0, | |
| "reward": 0.6920488923788071, | |
| "reward_std": 0.6076631285250187, | |
| "rewards/cosine_scaled_reward": 0.0439411043189466, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2577.729202270508, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.025095604360103607, | |
| "kl": 0.0001359879970550537, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0, | |
| "reward": 0.22021218854933977, | |
| "reward_std": 0.6630701459944248, | |
| "rewards/cosine_scaled_reward": -0.1294772457331419, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2143.1666870117188, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.03228743001818657, | |
| "kl": 0.00014454126358032227, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0, | |
| "reward": 0.7503797290846705, | |
| "reward_std": 0.2802760358899832, | |
| "rewards/cosine_scaled_reward": 0.06268983148038387, | |
| "rewards/format_reward": 0.625, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2969.541702270508, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.022628450766205788, | |
| "kl": 0.00018687546253204346, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0, | |
| "reward": 0.40104219946078956, | |
| "reward_std": 0.6805736906826496, | |
| "rewards/cosine_scaled_reward": 0.002604421228170395, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 1776.2708587646484, | |
| "epoch": 0.208, | |
| "grad_norm": 0.025273211300373077, | |
| "kl": 9.524030610918999e-05, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0, | |
| "reward": 0.8596987724304199, | |
| "reward_std": 0.5180976707488298, | |
| "rewards/cosine_scaled_reward": 0.05484938062727451, | |
| "rewards/format_reward": 0.75, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 1851.4792022705078, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.027847742661833763, | |
| "kl": 9.390711784362793e-05, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0, | |
| "reward": 0.8440241813659668, | |
| "reward_std": 0.6227804413065314, | |
| "rewards/cosine_scaled_reward": 0.06784541811794043, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2858.520854949951, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.035307928919792175, | |
| "kl": 0.0001779794692993164, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0, | |
| "reward": 0.06458963826298714, | |
| "reward_std": 0.34594947658479214, | |
| "rewards/cosine_scaled_reward": -0.14478851668536663, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2389.1250038146973, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.02831236459314823, | |
| "kl": 0.00010818615555763245, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0, | |
| "reward": 0.5413765218108892, | |
| "reward_std": 0.4714433569461107, | |
| "rewards/cosine_scaled_reward": -0.00014507770538330078, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2767.6666870117188, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.023117488250136375, | |
| "kl": 0.0001722574234008789, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0, | |
| "reward": 0.41337184607982635, | |
| "reward_std": 0.5102072861045599, | |
| "rewards/cosine_scaled_reward": 0.01918593794107437, | |
| "rewards/format_reward": 0.375, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2513.5000381469727, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.028418753296136856, | |
| "kl": 0.00018349289894104004, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0, | |
| "reward": 0.25105898082256317, | |
| "reward_std": 0.609873728826642, | |
| "rewards/cosine_scaled_reward": -0.10363717749714851, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 3422.1458435058594, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.017747724428772926, | |
| "kl": 0.00021284818649291992, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0, | |
| "reward": 0.0721919247880578, | |
| "reward_std": 0.6335567105561495, | |
| "rewards/cosine_scaled_reward": -0.07848738227039576, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 1912.0000305175781, | |
| "epoch": 0.216, | |
| "grad_norm": 0.03303361311554909, | |
| "kl": 0.00013190507888793945, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0, | |
| "reward": 0.6457886800635606, | |
| "reward_std": 0.5706375325098634, | |
| "rewards/cosine_scaled_reward": -0.010438980534672737, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2566.1666831970215, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.02325809746980667, | |
| "kl": 0.00013524293899536133, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0, | |
| "reward": 0.6750274561345577, | |
| "reward_std": 0.8541564792394638, | |
| "rewards/cosine_scaled_reward": 0.08751372992992401, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2105.062530517578, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.029074614867568016, | |
| "kl": 0.0001640021800994873, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0, | |
| "reward": 1.0512720253318548, | |
| "reward_std": 0.7890468016266823, | |
| "rewards/cosine_scaled_reward": 0.1923026624135673, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 3006.625045776367, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.018196215853095055, | |
| "kl": 0.0001971721649169922, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0, | |
| "reward": 0.1856545265763998, | |
| "reward_std": 0.5431043151766062, | |
| "rewards/cosine_scaled_reward": -0.11550608882680535, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2791.708396911621, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.026738908141851425, | |
| "kl": 0.00015562772750854492, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0, | |
| "reward": 0.6532938107848167, | |
| "reward_std": 0.8042124956846237, | |
| "rewards/cosine_scaled_reward": 0.03498022351413965, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 2549.3333740234375, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.02125261351466179, | |
| "kl": 0.00017492473125457764, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0, | |
| "reward": 1.3440267005935311, | |
| "reward_std": 0.6356805637478828, | |
| "rewards/cosine_scaled_reward": 0.33868001215159893, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2541.9375610351562, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.021625977009534836, | |
| "kl": 0.00013524293899536133, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0, | |
| "reward": 0.45053595933131874, | |
| "reward_std": 0.7725311703979969, | |
| "rewards/cosine_scaled_reward": -0.055982012301683426, | |
| "rewards/format_reward": 0.5625000018626451, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 3335.7083435058594, | |
| "epoch": 0.224, | |
| "grad_norm": 0.019518742337822914, | |
| "kl": 0.00020563602447509766, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0, | |
| "reward": 0.07744309306144714, | |
| "reward_std": 0.6891645342111588, | |
| "rewards/cosine_scaled_reward": -0.13836179114878178, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2233.312545776367, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.031853772699832916, | |
| "kl": 0.00016062520444393158, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0, | |
| "reward": 1.7223949134349823, | |
| "reward_std": 1.0976359993219376, | |
| "rewards/cosine_scaled_reward": 0.47578078508377075, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2403.312545776367, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.02528354711830616, | |
| "kl": 0.0001506805419921875, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0, | |
| "reward": 0.725980069488287, | |
| "reward_std": 0.9397312067449093, | |
| "rewards/cosine_scaled_reward": 0.07132336869835854, | |
| "rewards/format_reward": 0.583333345130086, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 3550.500030517578, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.01737813465297222, | |
| "kl": 0.00018775463104248047, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0, | |
| "reward": -0.2171228351071477, | |
| "reward_std": 0.7574310433119535, | |
| "rewards/cosine_scaled_reward": -0.17106142034754157, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1939.5833740234375, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.025537090376019478, | |
| "kl": 0.00011917948722839355, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0, | |
| "reward": 1.1004857262596488, | |
| "reward_std": 0.7319156341254711, | |
| "rewards/cosine_scaled_reward": 0.16482618637382984, | |
| "rewards/format_reward": 0.7708333358168602, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2528.5209045410156, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.026324188336730003, | |
| "kl": 0.00013250857591629028, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0, | |
| "reward": 1.465837650001049, | |
| "reward_std": 0.9725578310899436, | |
| "rewards/cosine_scaled_reward": 0.3683354835957289, | |
| "rewards/format_reward": 0.7291666828095913, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 2129.250030517578, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.027547147125005722, | |
| "kl": 0.00012653321027755737, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0, | |
| "reward": 1.2678888477385044, | |
| "reward_std": 0.42415672820061445, | |
| "rewards/cosine_scaled_reward": 0.3422777857631445, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 3210.416717529297, | |
| "epoch": 0.232, | |
| "grad_norm": 0.02026360109448433, | |
| "kl": 0.0002213120460510254, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0, | |
| "reward": 0.32417835108935833, | |
| "reward_std": 0.8184224963188171, | |
| "rewards/cosine_scaled_reward": -0.014994161203503609, | |
| "rewards/format_reward": 0.35416667349636555, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1677.7083892822266, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.03866266459226608, | |
| "kl": 0.0001564323902130127, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0, | |
| "reward": 0.9787286967039108, | |
| "reward_std": 0.7050071842968464, | |
| "rewards/cosine_scaled_reward": 0.11436433251947165, | |
| "rewards/format_reward": 0.7500000018626451, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 2191.7500762939453, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.026666179299354553, | |
| "kl": 0.00016039609909057617, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0, | |
| "reward": 1.4601695500314236, | |
| "reward_std": 0.8596113231033087, | |
| "rewards/cosine_scaled_reward": 0.3863347489386797, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2625.3958435058594, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.02177976816892624, | |
| "kl": 0.00015854835510253906, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0, | |
| "reward": 0.12173939868807793, | |
| "reward_std": 0.6548620201647282, | |
| "rewards/cosine_scaled_reward": -0.15788030996918678, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 2883.0625610351562, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.026248862966895103, | |
| "kl": 0.00018286705017089844, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0, | |
| "reward": 0.2657412812113762, | |
| "reward_std": 0.7520406804978848, | |
| "rewards/cosine_scaled_reward": -0.11712937615811825, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2729.125015258789, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.022093147039413452, | |
| "kl": 0.0001411736011505127, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0, | |
| "reward": 0.47902350744698197, | |
| "reward_std": 0.7772372476756573, | |
| "rewards/cosine_scaled_reward": 0.020761748775839806, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2192.9792289733887, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.026345793157815933, | |
| "kl": 0.00014121830463409424, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0, | |
| "reward": 1.030539353378117, | |
| "reward_std": 0.5581800434738398, | |
| "rewards/cosine_scaled_reward": 0.16110298968851566, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2763.8541870117188, | |
| "epoch": 0.24, | |
| "grad_norm": 0.0209537073969841, | |
| "kl": 0.00013713538646697998, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0, | |
| "reward": 0.7295061359182, | |
| "reward_std": 0.8547591110691428, | |
| "rewards/cosine_scaled_reward": 0.10433638375252485, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2132.6041870117188, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.029190730303525925, | |
| "kl": 0.00014358758926391602, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0, | |
| "reward": 0.844088938087225, | |
| "reward_std": 0.7269210359081626, | |
| "rewards/cosine_scaled_reward": 0.09912779554724693, | |
| "rewards/format_reward": 0.6458333358168602, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1901.6458587646484, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.03538208082318306, | |
| "kl": 0.00014218688011169434, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0, | |
| "reward": 0.6045566711109132, | |
| "reward_std": 0.5201306939125061, | |
| "rewards/cosine_scaled_reward": -0.010221696458756924, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1943.0000381469727, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.03535102307796478, | |
| "kl": 0.00016012787818908691, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0, | |
| "reward": 1.4567882642149925, | |
| "reward_std": 0.7881541578099132, | |
| "rewards/cosine_scaled_reward": 0.3742274232208729, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 2420.416702270508, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.03149150311946869, | |
| "kl": 0.00019598007202148438, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0, | |
| "reward": 0.46870883740484715, | |
| "reward_std": 0.7327337116003036, | |
| "rewards/cosine_scaled_reward": -0.06772891082800925, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1979.6250610351562, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.03919277340173721, | |
| "kl": 0.00014135241508483887, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0, | |
| "reward": 0.5261248834431171, | |
| "reward_std": 0.8126789703965187, | |
| "rewards/cosine_scaled_reward": -0.09110424015671015, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 1884.812515258789, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.0325796902179718, | |
| "kl": 0.0001468956470489502, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0, | |
| "reward": 0.7279386296868324, | |
| "reward_std": 0.569173252210021, | |
| "rewards/cosine_scaled_reward": 0.030635960400104523, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 2459.5833435058594, | |
| "epoch": 0.248, | |
| "grad_norm": 0.025857582688331604, | |
| "kl": 0.00016328692436218262, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0, | |
| "reward": 0.7756138246040791, | |
| "reward_std": 0.9014986008405685, | |
| "rewards/cosine_scaled_reward": 0.09614021610468626, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2637.5208740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.02500126138329506, | |
| "kl": 0.00015026330947875977, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0, | |
| "reward": 0.6237838268280029, | |
| "reward_std": 0.6200420390814543, | |
| "rewards/cosine_scaled_reward": 0.07230857852846384, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 2157.6666870117188, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.02829655073583126, | |
| "kl": 0.00017982721328735352, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0, | |
| "reward": 1.0598554275929928, | |
| "reward_std": 0.7442222200334072, | |
| "rewards/cosine_scaled_reward": 0.1757610123604536, | |
| "rewards/format_reward": 0.7083333395421505, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 2727.3541946411133, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.0359327532351017, | |
| "kl": 0.00015404075384140015, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0, | |
| "reward": -0.1477277409285307, | |
| "reward_std": 0.410174666903913, | |
| "rewards/cosine_scaled_reward": -0.2509472072124481, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 1798.3125038146973, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.02666349895298481, | |
| "kl": 0.00010177493095397949, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0, | |
| "reward": 0.9729608483612537, | |
| "reward_std": 0.3803430050611496, | |
| "rewards/cosine_scaled_reward": 0.13231376186013222, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 1928.5000228881836, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.02487647347152233, | |
| "kl": 0.00014640390872955322, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0, | |
| "reward": 1.0013896897435188, | |
| "reward_std": 0.6300395149737597, | |
| "rewards/cosine_scaled_reward": 0.14652817510068417, | |
| "rewards/format_reward": 0.708333333954215, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1702.7708435058594, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.02845071442425251, | |
| "kl": 0.00010716915130615234, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0, | |
| "reward": 0.9212910756468773, | |
| "reward_std": 0.6102632470428944, | |
| "rewards/cosine_scaled_reward": 0.08564550150185823, | |
| "rewards/format_reward": 0.75, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2827.3959045410156, | |
| "epoch": 0.256, | |
| "grad_norm": 0.021246662363409996, | |
| "kl": 0.00013241171836853027, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0, | |
| "reward": 0.6548797897994518, | |
| "reward_std": 0.7224124409258366, | |
| "rewards/cosine_scaled_reward": 0.06702320463955402, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2509.5625228881836, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.027184953913092613, | |
| "kl": 0.00013971328735351562, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0, | |
| "reward": 0.44291812740266323, | |
| "reward_std": 0.8240737412124872, | |
| "rewards/cosine_scaled_reward": -0.028540948405861855, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2216.375011444092, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.026975193992257118, | |
| "kl": 0.00015223026275634766, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0, | |
| "reward": 0.8716886155307293, | |
| "reward_std": 0.5759567692875862, | |
| "rewards/cosine_scaled_reward": 0.15459428541362286, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1686.8958702087402, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.03848085552453995, | |
| "kl": 0.00011584162712097168, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0, | |
| "reward": 1.1075771823525429, | |
| "reward_std": 0.913255337625742, | |
| "rewards/cosine_scaled_reward": 0.1475385595113039, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1939.8958435058594, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.03354226425290108, | |
| "kl": 0.00010298937559127808, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0, | |
| "reward": 0.8404038224834949, | |
| "reward_std": 0.31906691938638687, | |
| "rewards/cosine_scaled_reward": 0.13895191438496113, | |
| "rewards/format_reward": 0.5625, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 3084.041717529297, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.02156384289264679, | |
| "kl": 0.00019919872283935547, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0, | |
| "reward": 0.2632536366581917, | |
| "reward_std": 0.7106823846697807, | |
| "rewards/cosine_scaled_reward": -0.02462319377809763, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 2789.5208892822266, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.022447209805250168, | |
| "kl": 0.0001659989356994629, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0, | |
| "reward": 0.6103229657746851, | |
| "reward_std": 0.9964038021862507, | |
| "rewards/cosine_scaled_reward": 0.034328147768974304, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2335.125030517578, | |
| "epoch": 0.264, | |
| "grad_norm": 0.02745773270726204, | |
| "kl": 0.00013434886932373047, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0, | |
| "reward": 0.8026296477764845, | |
| "reward_std": 0.6375624313950539, | |
| "rewards/cosine_scaled_reward": 0.08881480386480689, | |
| "rewards/format_reward": 0.6250000018626451, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 2833.791763305664, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.02423485741019249, | |
| "kl": 0.00019976496696472168, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0, | |
| "reward": 0.7053144201636314, | |
| "reward_std": 0.9186714664101601, | |
| "rewards/cosine_scaled_reward": 0.08182389056310058, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 1837.6666946411133, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.03410692512989044, | |
| "kl": 0.0001348555088043213, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0, | |
| "reward": 1.0225979089736938, | |
| "reward_std": 1.1002402231097221, | |
| "rewards/cosine_scaled_reward": 0.15713229309767485, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2508.9791889190674, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.03137635439634323, | |
| "kl": 0.0001417398452758789, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0, | |
| "reward": 0.4594912249594927, | |
| "reward_std": 0.34390021953731775, | |
| "rewards/cosine_scaled_reward": -0.009837735444307327, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 2293.812526702881, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.03317144513130188, | |
| "kl": 0.00013634003698825836, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0, | |
| "reward": 0.8775238930247724, | |
| "reward_std": 0.8305205693468451, | |
| "rewards/cosine_scaled_reward": 0.15751194162294269, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2535.791702270508, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.022067207843065262, | |
| "kl": 0.0001373887062072754, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0, | |
| "reward": 1.103066761046648, | |
| "reward_std": 0.9327066205441952, | |
| "rewards/cosine_scaled_reward": 0.23903337866067886, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 2465.583335876465, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.023853298276662827, | |
| "kl": 0.00014454126358032227, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0, | |
| "reward": 0.7351281112059951, | |
| "reward_std": 0.7459862437099218, | |
| "rewards/cosine_scaled_reward": 0.12798071585712023, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 2457.104217529297, | |
| "epoch": 0.272, | |
| "grad_norm": 0.022526247426867485, | |
| "kl": 0.0001602470874786377, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0, | |
| "reward": 1.184366210596636, | |
| "reward_std": 0.9232276640832424, | |
| "rewards/cosine_scaled_reward": 0.2171830991283059, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1609.354206085205, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.03424397110939026, | |
| "kl": 8.738785982131958e-05, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0, | |
| "reward": 1.3734241500496864, | |
| "reward_std": 0.5677254348993301, | |
| "rewards/cosine_scaled_reward": 0.30129539873450994, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2939.1458740234375, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.028905417770147324, | |
| "kl": 0.00021439790725708008, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0, | |
| "reward": -0.0536014367826283, | |
| "reward_std": 0.5374163910746574, | |
| "rewards/cosine_scaled_reward": -0.21430072654038668, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 2732.6250228881836, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.02268056571483612, | |
| "kl": 0.00020313262939453125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0, | |
| "reward": 0.2373307328671217, | |
| "reward_std": 0.628166763111949, | |
| "rewards/cosine_scaled_reward": -0.13133463938720524, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1986.0000228881836, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.03629198670387268, | |
| "kl": 0.00025428086519241333, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0, | |
| "reward": 0.5092753138160333, | |
| "reward_std": 0.674851905554533, | |
| "rewards/cosine_scaled_reward": -0.047445693984627724, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 2734.229217529297, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.021753275766968727, | |
| "kl": 0.00011421740055084229, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0, | |
| "reward": 0.9028994496911764, | |
| "reward_std": 1.0264187045395374, | |
| "rewards/cosine_scaled_reward": 0.2014497071504593, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 2789.687545776367, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.02406812459230423, | |
| "kl": 0.00016814470291137695, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0, | |
| "reward": 0.5570749193429947, | |
| "reward_std": 1.1161103919148445, | |
| "rewards/cosine_scaled_reward": 0.05978744977619499, | |
| "rewards/format_reward": 0.43750000931322575, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 2515.312568664551, | |
| "epoch": 0.28, | |
| "grad_norm": 0.029088586568832397, | |
| "kl": 0.0001869797706604004, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0, | |
| "reward": 1.0382240321487188, | |
| "reward_std": 1.1980111673474312, | |
| "rewards/cosine_scaled_reward": 0.18577865794941317, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 2277.875045776367, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.02349865809082985, | |
| "kl": 0.00016105175018310547, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0, | |
| "reward": 0.9948470462113619, | |
| "reward_std": 0.784628352150321, | |
| "rewards/cosine_scaled_reward": 0.13284017331898212, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 2917.416717529297, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.019329527392983437, | |
| "kl": 0.0001506805419921875, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0, | |
| "reward": 0.3844575481489301, | |
| "reward_std": 0.7838251367211342, | |
| "rewards/cosine_scaled_reward": -0.005687899887561798, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1923.4375381469727, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.034683942794799805, | |
| "kl": 0.0001239478588104248, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0, | |
| "reward": 1.2442171084694564, | |
| "reward_std": 0.8035479206591845, | |
| "rewards/cosine_scaled_reward": 0.2887751990929246, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1708.2916946411133, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.03058278001844883, | |
| "kl": 0.00011420249938964844, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0, | |
| "reward": 0.7377499938011169, | |
| "reward_std": 0.5643534660339355, | |
| "rewards/cosine_scaled_reward": 0.02512497268617153, | |
| "rewards/format_reward": 0.6875, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2321.4167137145996, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.02951645292341709, | |
| "kl": 0.00019127130508422852, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0, | |
| "reward": 0.5674178429762833, | |
| "reward_std": 0.8846789188683033, | |
| "rewards/cosine_scaled_reward": -0.04962442675605416, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1895.1875228881836, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.02950862981379032, | |
| "kl": 0.0001430809497833252, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0, | |
| "reward": 1.1478381529450417, | |
| "reward_std": 0.9535709973424673, | |
| "rewards/cosine_scaled_reward": 0.17808573693037033, | |
| "rewards/format_reward": 0.7916666772216558, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2560.3125534057617, | |
| "epoch": 0.288, | |
| "grad_norm": 0.026902787387371063, | |
| "kl": 0.00019630789756774902, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0, | |
| "reward": 0.29892047587782145, | |
| "reward_std": 0.697446895763278, | |
| "rewards/cosine_scaled_reward": -0.09012311231344938, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2656.145854949951, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.03214016929268837, | |
| "kl": 0.00017839670181274414, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0, | |
| "reward": 0.3197309598326683, | |
| "reward_std": 0.8267754539847374, | |
| "rewards/cosine_scaled_reward": -0.11096786396956304, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 2232.6666984558105, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.03257149085402489, | |
| "kl": 0.00014886260032653809, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0, | |
| "reward": 0.8183548748493195, | |
| "reward_std": 0.742323987185955, | |
| "rewards/cosine_scaled_reward": 0.09667741693556309, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 3091.645854949951, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.02308414690196514, | |
| "kl": 0.00019991397857666016, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0, | |
| "reward": 0.48197503201663494, | |
| "reward_std": 1.0647086016833782, | |
| "rewards/cosine_scaled_reward": 0.03265417801867443, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 2500.2917404174805, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.025716979056596756, | |
| "kl": 0.0002094060182571411, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0, | |
| "reward": 0.6078239146154374, | |
| "reward_std": 0.9901378192007542, | |
| "rewards/cosine_scaled_reward": 0.01224527694284916, | |
| "rewards/format_reward": 0.5833333469927311, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 2928.791702270508, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.020812608301639557, | |
| "kl": 0.00019612908363342285, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0, | |
| "reward": 1.0758514516055584, | |
| "reward_std": 0.8948536142706871, | |
| "rewards/cosine_scaled_reward": 0.26709236670285463, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 3086.6875915527344, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.022219646722078323, | |
| "kl": 0.0002516508102416992, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0, | |
| "reward": 0.5924380738288164, | |
| "reward_std": 0.8064648397266865, | |
| "rewards/cosine_scaled_reward": 0.035802360624074936, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 2551.250026702881, | |
| "epoch": 0.296, | |
| "grad_norm": 0.03364958614110947, | |
| "kl": 0.00019508600234985352, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0, | |
| "reward": 0.6504997052252293, | |
| "reward_std": 0.7922412864863873, | |
| "rewards/cosine_scaled_reward": 0.07524984562769532, | |
| "rewards/format_reward": 0.5000000093132257, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1881.8541831970215, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.03499652445316315, | |
| "kl": 0.00013124942779541016, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0, | |
| "reward": 1.0665137059986591, | |
| "reward_std": 0.5399355934932828, | |
| "rewards/cosine_scaled_reward": 0.231173490639776, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2637.4166870117188, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.025031670928001404, | |
| "kl": 0.00015434622764587402, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0, | |
| "reward": 0.3631602041423321, | |
| "reward_std": 0.6417339015752077, | |
| "rewards/cosine_scaled_reward": -0.047586577478796244, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 3057.458366394043, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.022654270753264427, | |
| "kl": 0.00020933151245117188, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0, | |
| "reward": 0.11393738351762295, | |
| "reward_std": 0.7356228828430176, | |
| "rewards/cosine_scaled_reward": -0.10969797242432833, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2784.5000228881836, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.027890196070075035, | |
| "kl": 0.00014328956604003906, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0, | |
| "reward": 0.5555706415325403, | |
| "reward_std": 0.6993023809045553, | |
| "rewards/cosine_scaled_reward": 0.05903530679643154, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2711.916702270508, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.026740290224552155, | |
| "kl": 0.0001666545867919922, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0, | |
| "reward": 0.5835818080231547, | |
| "reward_std": 0.9015247318893671, | |
| "rewards/cosine_scaled_reward": 0.010540907853282988, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1977.6875305175781, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.03000440075993538, | |
| "kl": 0.00016647577285766602, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0, | |
| "reward": 1.0214002854190767, | |
| "reward_std": 0.7950426787137985, | |
| "rewards/cosine_scaled_reward": 0.16695012245327234, | |
| "rewards/format_reward": 0.6875000018626451, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 3163.958354949951, | |
| "epoch": 0.304, | |
| "grad_norm": 0.01840229332447052, | |
| "kl": 0.00019887089729309082, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0, | |
| "reward": -0.22563371248543262, | |
| "reward_std": 0.4246583506464958, | |
| "rewards/cosine_scaled_reward": -0.21698353067040443, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2904.750045776367, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.024483945220708847, | |
| "kl": 0.0001894235610961914, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0, | |
| "reward": 0.3056447245180607, | |
| "reward_std": 0.7651229426264763, | |
| "rewards/cosine_scaled_reward": -0.024260970763862133, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2707.229202270508, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.033471375703811646, | |
| "kl": 0.0002181529998779297, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0, | |
| "reward": 0.31767203425988555, | |
| "reward_std": 1.0267981998622417, | |
| "rewards/cosine_scaled_reward": -0.07033066404983401, | |
| "rewards/format_reward": 0.4583333469927311, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 2601.145881652832, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.023748284205794334, | |
| "kl": 0.00017061829566955566, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0, | |
| "reward": 1.1178178992122412, | |
| "reward_std": 0.6580545753240585, | |
| "rewards/cosine_scaled_reward": 0.2568256063386798, | |
| "rewards/format_reward": 0.6041666697710752, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 2286.145851135254, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.026643814519047737, | |
| "kl": 0.00014588236808776855, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0, | |
| "reward": 1.167864153161645, | |
| "reward_std": 0.851657796651125, | |
| "rewards/cosine_scaled_reward": 0.21934875007718801, | |
| "rewards/format_reward": 0.7291666809469461, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1784.645866394043, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.030084380879998207, | |
| "kl": 0.00011153519153594971, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0, | |
| "reward": 1.4007908403873444, | |
| "reward_std": 0.4155320357531309, | |
| "rewards/cosine_scaled_reward": 0.29414540342986584, | |
| "rewards/format_reward": 0.8125, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 2623.0208587646484, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.02625565230846405, | |
| "kl": 0.00020056962966918945, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0, | |
| "reward": 0.664887816645205, | |
| "reward_std": 1.0364688262343407, | |
| "rewards/cosine_scaled_reward": 0.09286056295968592, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 2523.1875534057617, | |
| "epoch": 0.312, | |
| "grad_norm": 0.02682247757911682, | |
| "kl": 0.00015234388411045074, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0, | |
| "reward": 0.8108466335106641, | |
| "reward_std": 0.6180602237582207, | |
| "rewards/cosine_scaled_reward": 0.1762566501274705, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1638.1875190734863, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.035664211958646774, | |
| "kl": 0.0001033395528793335, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0, | |
| "reward": 1.2543680891394615, | |
| "reward_std": 0.6733489837497473, | |
| "rewards/cosine_scaled_reward": 0.21051735104992986, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2227.7500381469727, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.024416198953986168, | |
| "kl": 0.00014771521091461182, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0, | |
| "reward": 1.0997284352779388, | |
| "reward_std": 0.6753085609525442, | |
| "rewards/cosine_scaled_reward": 0.21653086133301258, | |
| "rewards/format_reward": 0.6666666697710752, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2410.375045776367, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.03159765899181366, | |
| "kl": 0.00018554925918579102, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0, | |
| "reward": 1.0730885528028011, | |
| "reward_std": 0.8632807899266481, | |
| "rewards/cosine_scaled_reward": 0.24487759917974472, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2498.7708587646484, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.024099338799715042, | |
| "kl": 0.00020748376846313477, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0, | |
| "reward": 0.6968964114785194, | |
| "reward_std": 0.8357117287814617, | |
| "rewards/cosine_scaled_reward": 0.09844818594865501, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 2121.0209426879883, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.032011423259973526, | |
| "kl": 0.00018835067749023438, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0, | |
| "reward": 1.1160824708640575, | |
| "reward_std": 0.7977323867380619, | |
| "rewards/cosine_scaled_reward": 0.17262454610317945, | |
| "rewards/format_reward": 0.770833345130086, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 3179.7708435058594, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.022884871810674667, | |
| "kl": 0.0002695918083190918, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0, | |
| "reward": 0.020895134657621384, | |
| "reward_std": 0.8185425326228142, | |
| "rewards/cosine_scaled_reward": -0.09371909964829683, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2095.4583778381348, | |
| "epoch": 0.32, | |
| "grad_norm": 0.03684168681502342, | |
| "kl": 0.0001882314682006836, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0, | |
| "reward": 0.6300602070987225, | |
| "reward_std": 0.8308645784854889, | |
| "rewards/cosine_scaled_reward": -0.007886571809649467, | |
| "rewards/format_reward": 0.6458333395421505, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 3198.8958587646484, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.021219369024038315, | |
| "kl": 0.00019353628158569336, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0, | |
| "reward": -0.09903884382219985, | |
| "reward_std": 0.7667136080563068, | |
| "rewards/cosine_scaled_reward": -0.20576944231288508, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2406.458366394043, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.02495969459414482, | |
| "kl": 0.00018447637557983398, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0, | |
| "reward": 0.7009545154869556, | |
| "reward_std": 0.7124308422207832, | |
| "rewards/cosine_scaled_reward": 0.10047726146876812, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2349.270896911621, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.025876298546791077, | |
| "kl": 0.00015360116958618164, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0, | |
| "reward": 1.3722519651055336, | |
| "reward_std": 0.653431911021471, | |
| "rewards/cosine_scaled_reward": 0.34237595461308956, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2050.7708587646484, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.03173566982150078, | |
| "kl": 0.00013002753257751465, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0, | |
| "reward": 0.6497855484485626, | |
| "reward_std": 0.8422477524727583, | |
| "rewards/cosine_scaled_reward": 0.012392764445394278, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1926.0208435058594, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.02867991477251053, | |
| "kl": 0.00016066431999206543, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0, | |
| "reward": 0.4057514422456734, | |
| "reward_std": 0.3730186829343438, | |
| "rewards/cosine_scaled_reward": -0.13045761734247208, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2342.687530517578, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.02181842550635338, | |
| "kl": 0.00020366907119750977, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.0, | |
| "reward": 0.7377639599144459, | |
| "reward_std": 0.7825022768229246, | |
| "rewards/cosine_scaled_reward": 0.07721533998847008, | |
| "rewards/format_reward": 0.5833333488553762, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1700.1250076293945, | |
| "epoch": 0.328, | |
| "grad_norm": 0.03624531999230385, | |
| "kl": 0.00015243887901306152, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0, | |
| "reward": 0.8391638435423374, | |
| "reward_std": 0.34983737021684647, | |
| "rewards/cosine_scaled_reward": 0.11749858036637306, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 2821.4167098999023, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.021799219772219658, | |
| "kl": 0.0002630949020385742, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0, | |
| "reward": 0.45475615188479424, | |
| "reward_std": 0.5843030996620655, | |
| "rewards/cosine_scaled_reward": 0.01904474012553692, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2083.333351135254, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.03328165039420128, | |
| "kl": 0.00016808509826660156, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0, | |
| "reward": 0.7187244035303593, | |
| "reward_std": 0.586519805714488, | |
| "rewards/cosine_scaled_reward": 0.0676955422386527, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1441.3333778381348, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.041413385421037674, | |
| "kl": 0.0001710951328277588, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0, | |
| "reward": 1.352583286818117, | |
| "reward_std": 0.8790040239691734, | |
| "rewards/cosine_scaled_reward": 0.21795828046742827, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 2162.9167098999023, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.034360699355602264, | |
| "kl": 0.00018029659986495972, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.0, | |
| "reward": 1.1365532241761684, | |
| "reward_std": 0.8855068720877171, | |
| "rewards/cosine_scaled_reward": 0.22452658973634243, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 2644.9583587646484, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.036109767854213715, | |
| "kl": 0.0002117753028869629, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0, | |
| "reward": 0.4815543070435524, | |
| "reward_std": 0.9521236941218376, | |
| "rewards/cosine_scaled_reward": -0.01963951502693817, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2065.250030517578, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.02345646359026432, | |
| "kl": 0.00020043551921844482, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0, | |
| "reward": 0.7920062579214573, | |
| "reward_std": 0.5047565586864948, | |
| "rewards/cosine_scaled_reward": 0.06266978848725557, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 3125.937545776367, | |
| "epoch": 0.336, | |
| "grad_norm": 0.021944746375083923, | |
| "kl": 0.00028936564922332764, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0, | |
| "reward": 0.09450298827141523, | |
| "reward_std": 0.7673522084951401, | |
| "rewards/cosine_scaled_reward": -0.11941516539081931, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2895.0416870117188, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.02375158853828907, | |
| "kl": 0.00023639202117919922, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0, | |
| "reward": 0.26469022780656815, | |
| "reward_std": 0.7072394695132971, | |
| "rewards/cosine_scaled_reward": -0.03432156052440405, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 2651.1250610351562, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.035691626369953156, | |
| "kl": 0.00022995471954345703, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0, | |
| "reward": 0.3209421820938587, | |
| "reward_std": 0.5486197881400585, | |
| "rewards/cosine_scaled_reward": -0.08952891454100609, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 3260.8958740234375, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.01949848048388958, | |
| "kl": 0.0002536773681640625, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0, | |
| "reward": 0.2345700664445758, | |
| "reward_std": 0.6523050926625729, | |
| "rewards/cosine_scaled_reward": -0.0077149709686636925, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2739.8333740234375, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.02277456782758236, | |
| "kl": 0.0001857280731201172, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0, | |
| "reward": 0.43556486954912543, | |
| "reward_std": 0.6449983278289437, | |
| "rewards/cosine_scaled_reward": 0.009449097327888012, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 2909.4791870117188, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.019031217321753502, | |
| "kl": 0.00021719932556152344, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0, | |
| "reward": 0.6691161692142487, | |
| "reward_std": 0.7011800315231085, | |
| "rewards/cosine_scaled_reward": 0.11580806598067284, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 3504.541717529297, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.02002292312681675, | |
| "kl": 0.0002887248992919922, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0, | |
| "reward": -0.11457789549604058, | |
| "reward_std": 0.6568729132413864, | |
| "rewards/cosine_scaled_reward": -0.21353895403444767, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2172.7291870117188, | |
| "epoch": 0.344, | |
| "grad_norm": 0.029103700071573257, | |
| "kl": 0.00020676851272583008, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0, | |
| "reward": 0.6181785631924868, | |
| "reward_std": 0.7141751162707806, | |
| "rewards/cosine_scaled_reward": -0.045077390037477016, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 2186.4792251586914, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.03206675127148628, | |
| "kl": 0.00018966197967529297, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0, | |
| "reward": 0.9613751322031021, | |
| "reward_std": 0.7258251551538706, | |
| "rewards/cosine_scaled_reward": 0.14735423121601343, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 2475.937545776367, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.030688228085637093, | |
| "kl": 0.00026735663414001465, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0, | |
| "reward": 0.7674907688051462, | |
| "reward_std": 0.7924170382320881, | |
| "rewards/cosine_scaled_reward": 0.11291203834116459, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 2449.791717529297, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.024618245661258698, | |
| "kl": 0.00020569562911987305, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0, | |
| "reward": 0.6075898297131062, | |
| "reward_std": 0.6916676126420498, | |
| "rewards/cosine_scaled_reward": 0.032961574383080006, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 2968.2916870117188, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.02341613359749317, | |
| "kl": 0.00027740001678466797, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0, | |
| "reward": 0.38752281852066517, | |
| "reward_std": 0.653439100831747, | |
| "rewards/cosine_scaled_reward": -0.024988610297441483, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 2223.770851135254, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.03363551199436188, | |
| "kl": 0.000197485089302063, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0, | |
| "reward": 0.603302130009979, | |
| "reward_std": 0.6513868616893888, | |
| "rewards/cosine_scaled_reward": 0.009984390810132027, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 2278.1250495910645, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.02858709916472435, | |
| "kl": 0.00022980570793151855, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0, | |
| "reward": 0.6608727481216192, | |
| "reward_std": 0.9192832596600056, | |
| "rewards/cosine_scaled_reward": 0.05960302893072367, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 3100.4583587646484, | |
| "epoch": 0.352, | |
| "grad_norm": 0.020364860072731972, | |
| "kl": 0.0002155900001525879, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0, | |
| "reward": 0.0357023049145937, | |
| "reward_std": 0.8593032034114003, | |
| "rewards/cosine_scaled_reward": -0.10714884800836444, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 2716.1666946411133, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.021565675735473633, | |
| "kl": 0.00018143653869628906, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.0, | |
| "reward": 0.9010052662342787, | |
| "reward_std": 0.7911521699279547, | |
| "rewards/cosine_scaled_reward": 0.19008595822378993, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 2119.1667098999023, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.03625396639108658, | |
| "kl": 0.00021964311599731445, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0, | |
| "reward": 0.7360105197876692, | |
| "reward_std": 0.6620105188339949, | |
| "rewards/cosine_scaled_reward": 0.0555052665488347, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 2410.833354949951, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.02665293589234352, | |
| "kl": 0.00021332502365112305, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0, | |
| "reward": 0.9606177415698767, | |
| "reward_std": 0.5332410894334316, | |
| "rewards/cosine_scaled_reward": 0.20947551727294922, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 2079.7500228881836, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.039660558104515076, | |
| "kl": 0.00025030970573425293, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0, | |
| "reward": 0.9072759561240673, | |
| "reward_std": 0.7400492213200778, | |
| "rewards/cosine_scaled_reward": 0.18280465085990727, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2780.166778564453, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.024134520441293716, | |
| "kl": 0.0002815723419189453, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0, | |
| "reward": 0.7847088079433888, | |
| "reward_std": 0.7853051656857133, | |
| "rewards/cosine_scaled_reward": 0.15277107199653983, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2324.958354949951, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.03832564502954483, | |
| "kl": 0.00017490237951278687, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.0, | |
| "reward": 0.5692981313914061, | |
| "reward_std": 0.5818128418177366, | |
| "rewards/cosine_scaled_reward": 0.03464904520660639, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2754.1875534057617, | |
| "epoch": 0.36, | |
| "grad_norm": 0.02288965694606304, | |
| "kl": 0.0002474188804626465, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0, | |
| "reward": 0.4873616211116314, | |
| "reward_std": 0.7167357690632343, | |
| "rewards/cosine_scaled_reward": 0.03534747939556837, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 3277.7291870117188, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.021539807319641113, | |
| "kl": 0.0003350973129272461, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0, | |
| "reward": 0.06327132880687714, | |
| "reward_std": 0.722619965672493, | |
| "rewards/cosine_scaled_reward": -0.09336433373391628, | |
| "rewards/format_reward": 0.25000000931322575, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 2702.958351135254, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.02301914617419243, | |
| "kl": 0.00023549795150756836, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0, | |
| "reward": 0.5955107100307941, | |
| "reward_std": 0.7853559423238039, | |
| "rewards/cosine_scaled_reward": 0.06858869083225727, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1185.270866394043, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.041411470621824265, | |
| "kl": 0.00015148520469665527, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0, | |
| "reward": 1.0035362336784601, | |
| "reward_std": 0.6542131118476391, | |
| "rewards/cosine_scaled_reward": 0.0642680861055851, | |
| "rewards/format_reward": 0.8750000055879354, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 2636.291690826416, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.027852993458509445, | |
| "kl": 0.00036531686782836914, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0, | |
| "reward": -0.07456635776907206, | |
| "reward_std": 0.6103027909994125, | |
| "rewards/cosine_scaled_reward": -0.23519985377788544, | |
| "rewards/format_reward": 0.39583333767950535, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1750.3958740234375, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.040690820664167404, | |
| "kl": 0.00023043155670166016, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0, | |
| "reward": 0.6239464022219181, | |
| "reward_std": 0.7162524946033955, | |
| "rewards/cosine_scaled_reward": -0.04219348356127739, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 1433.7083702087402, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.03097819723188877, | |
| "kl": 0.00015997886657714844, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0, | |
| "reward": 1.5650277510285378, | |
| "reward_std": 0.6967961974442005, | |
| "rewards/cosine_scaled_reward": 0.35543054959271103, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2536.6250343322754, | |
| "epoch": 0.368, | |
| "grad_norm": 0.035577192902565, | |
| "kl": 0.0003066062927246094, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0, | |
| "reward": 0.3695586025714874, | |
| "reward_std": 0.85175746306777, | |
| "rewards/cosine_scaled_reward": -0.05480404058471322, | |
| "rewards/format_reward": 0.47916666977107525, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2828.7083587646484, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.022153332829475403, | |
| "kl": 0.0002874135971069336, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0, | |
| "reward": 0.35160677476233104, | |
| "reward_std": 0.49565985053777695, | |
| "rewards/cosine_scaled_reward": -0.03252994408831, | |
| "rewards/format_reward": 0.41666666977107525, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 2622.7084045410156, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.025416357442736626, | |
| "kl": 0.00023853778839111328, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0, | |
| "reward": 0.6543685221113265, | |
| "reward_std": 0.733675017952919, | |
| "rewards/cosine_scaled_reward": 0.01468425802886486, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 2451.312568664551, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.030104102566838264, | |
| "kl": 0.00023043155670166016, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0, | |
| "reward": 1.3238378688693047, | |
| "reward_std": 0.8181280642747879, | |
| "rewards/cosine_scaled_reward": 0.3077522525563836, | |
| "rewards/format_reward": 0.7083333469927311, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2106.645866394043, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.027758557349443436, | |
| "kl": 0.0002227574586868286, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0, | |
| "reward": 0.5596632058732212, | |
| "reward_std": 0.41956991609185934, | |
| "rewards/cosine_scaled_reward": -0.01183508150279522, | |
| "rewards/format_reward": 0.583333333954215, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2574.145851135254, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.022373398765921593, | |
| "kl": 0.0002663135528564453, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0, | |
| "reward": 0.6893210001289845, | |
| "reward_std": 0.5786230899393559, | |
| "rewards/cosine_scaled_reward": 0.09466049098409712, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 3245.9791870117188, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.020722128450870514, | |
| "kl": 0.0002732276916503906, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0, | |
| "reward": -0.09898044681176543, | |
| "reward_std": 0.7348660603165627, | |
| "rewards/cosine_scaled_reward": -0.16407354921102524, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 1781.6875381469727, | |
| "epoch": 0.376, | |
| "grad_norm": 0.03358970955014229, | |
| "kl": 0.00023666024208068848, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0, | |
| "reward": 0.9239476323127747, | |
| "reward_std": 0.6006343699991703, | |
| "rewards/cosine_scaled_reward": 0.12864043563604355, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2251.4166870117188, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.03271754831075668, | |
| "kl": 0.00033169984817504883, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0, | |
| "reward": 0.4894188493490219, | |
| "reward_std": 0.6065305564552546, | |
| "rewards/cosine_scaled_reward": -0.026123913936316967, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2500.625026702881, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.046767521649599075, | |
| "kl": 0.0003387928009033203, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0, | |
| "reward": 0.19016977865248919, | |
| "reward_std": 0.5361207164824009, | |
| "rewards/cosine_scaled_reward": -0.09241510927677155, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2354.2917251586914, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 0.025751523673534393, | |
| "kl": 0.00024452805519104004, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0, | |
| "reward": 0.5813850574195385, | |
| "reward_std": 0.7453097756952047, | |
| "rewards/cosine_scaled_reward": -0.03222414234187454, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1749.708366394043, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 0.030186345800757408, | |
| "kl": 0.00021500885486602783, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0, | |
| "reward": 0.9438134930096567, | |
| "reward_std": 0.7434254898689687, | |
| "rewards/cosine_scaled_reward": 0.08649008721113205, | |
| "rewards/format_reward": 0.7708333488553762, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 3011.7916870117188, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.020333390682935715, | |
| "kl": 0.0002646446228027344, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0, | |
| "reward": 0.013087651343084872, | |
| "reward_std": 0.8183173164725304, | |
| "rewards/cosine_scaled_reward": -0.19137283554300666, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2112.37508392334, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.025975586846470833, | |
| "kl": 0.00021225214004516602, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0, | |
| "reward": 1.0843083523213863, | |
| "reward_std": 0.8515229849144816, | |
| "rewards/cosine_scaled_reward": 0.19840414822101593, | |
| "rewards/format_reward": 0.6875000037252903, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 2667.9583892822266, | |
| "epoch": 0.384, | |
| "grad_norm": 0.029666097834706306, | |
| "kl": 0.0002618134021759033, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0, | |
| "reward": 0.6127568809315562, | |
| "reward_std": 0.935376051813364, | |
| "rewards/cosine_scaled_reward": 0.045961756026372313, | |
| "rewards/format_reward": 0.5208333469927311, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 2586.4583587646484, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.03280950337648392, | |
| "kl": 0.00030684471130371094, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0, | |
| "reward": 0.4611587089020759, | |
| "reward_std": 0.941901870071888, | |
| "rewards/cosine_scaled_reward": -0.029837319627404213, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1920.6250457763672, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.03358282893896103, | |
| "kl": 0.00031685829162597656, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0, | |
| "reward": 1.1313491351902485, | |
| "reward_std": 1.0058028288185596, | |
| "rewards/cosine_scaled_reward": 0.2115078903734684, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 2708.6042098999023, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.023695282638072968, | |
| "kl": 0.00026488304138183594, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0, | |
| "reward": 0.38326615560799837, | |
| "reward_std": 0.6079130750149488, | |
| "rewards/cosine_scaled_reward": -0.0375335980206728, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2371.625030517578, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.025804683566093445, | |
| "kl": 0.00025916099548339844, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0, | |
| "reward": 0.6570014208555222, | |
| "reward_std": 0.9428398255258799, | |
| "rewards/cosine_scaled_reward": 0.026417370419949293, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2184.333381652832, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.018583592027425766, | |
| "kl": 0.00025847554206848145, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0, | |
| "reward": 1.163216508924961, | |
| "reward_std": 0.36735112965106964, | |
| "rewards/cosine_scaled_reward": 0.21702490840107203, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2734.500045776367, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.030387967824935913, | |
| "kl": 0.00031244754791259766, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0, | |
| "reward": 0.5118438927456737, | |
| "reward_std": 0.9676601774990559, | |
| "rewards/cosine_scaled_reward": 0.037171950563788414, | |
| "rewards/format_reward": 0.43750000186264515, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 2897.3959045410156, | |
| "epoch": 0.392, | |
| "grad_norm": 0.018177591264247894, | |
| "kl": 0.00023752450942993164, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0, | |
| "reward": 0.8089198600500822, | |
| "reward_std": 1.0615319572389126, | |
| "rewards/cosine_scaled_reward": 0.13362658489495516, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2030.3333549499512, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.02531350590288639, | |
| "kl": 0.00022709369659423828, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0, | |
| "reward": 1.0889792470261455, | |
| "reward_std": 0.7364504262804985, | |
| "rewards/cosine_scaled_reward": 0.20073960922309197, | |
| "rewards/format_reward": 0.6875000018626451, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2663.0833587646484, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.0251361895352602, | |
| "kl": 0.0002938807010650635, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0, | |
| "reward": 0.43384309159591794, | |
| "reward_std": 0.49294498562812805, | |
| "rewards/cosine_scaled_reward": -0.012245113030076027, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 3057.0625610351562, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.01870022341609001, | |
| "kl": 0.00028312206268310547, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0, | |
| "reward": 0.3562997840344906, | |
| "reward_std": 0.6700815781950951, | |
| "rewards/cosine_scaled_reward": -0.071850111708045, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 2926.7708740234375, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.019071895629167557, | |
| "kl": 0.0002764463424682617, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0, | |
| "reward": 0.21276836469769478, | |
| "reward_std": 0.5730974823236465, | |
| "rewards/cosine_scaled_reward": -0.12278249301016331, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2656.3958435058594, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.03188029304146767, | |
| "kl": 0.0002663731575012207, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0, | |
| "reward": 0.4368863687850535, | |
| "reward_std": 0.4383883699774742, | |
| "rewards/cosine_scaled_reward": -0.02114013209939003, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 3118.750030517578, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.020537259057164192, | |
| "kl": 0.0003848075866699219, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0, | |
| "reward": 0.15319344215095043, | |
| "reward_std": 0.5101428851485252, | |
| "rewards/cosine_scaled_reward": -0.07965327124111354, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 2212.3750381469727, | |
| "epoch": 0.4, | |
| "grad_norm": 0.03013945370912552, | |
| "kl": 0.0002713203430175781, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.0, | |
| "reward": 0.9151125885546207, | |
| "reward_std": 0.895346611738205, | |
| "rewards/cosine_scaled_reward": 0.14505626424215734, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 2579.4375381469727, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.021040596067905426, | |
| "kl": 0.0002969503402709961, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.5091639803722501, | |
| "reward_std": 0.6855394951999187, | |
| "rewards/cosine_scaled_reward": 0.004581989720463753, | |
| "rewards/format_reward": 0.5000000018626451, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2121.1458740234375, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.02947075478732586, | |
| "kl": 0.0002119988203048706, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0, | |
| "reward": 0.7395412735641003, | |
| "reward_std": 0.7547396682202816, | |
| "rewards/cosine_scaled_reward": 0.046853944193571806, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 2014.5833435058594, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.03271789103746414, | |
| "kl": 0.0002213120460510254, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0, | |
| "reward": 1.2261908203363419, | |
| "reward_std": 0.7751676780171692, | |
| "rewards/cosine_scaled_reward": 0.25892873853445053, | |
| "rewards/format_reward": 0.7083333414047956, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1930.916706085205, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.03464234620332718, | |
| "kl": 0.00023859739303588867, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0, | |
| "reward": 0.788333578966558, | |
| "reward_std": 0.7727246461436152, | |
| "rewards/cosine_scaled_reward": 0.060833441093564034, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 2235.229217529297, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.02619633823633194, | |
| "kl": 0.0002925395965576172, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0, | |
| "reward": 0.9775698743760586, | |
| "reward_std": 1.000068573281169, | |
| "rewards/cosine_scaled_reward": 0.18670159205794334, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2525.604232788086, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.022776400670409203, | |
| "kl": 0.00028067827224731445, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0, | |
| "reward": 1.0638019368052483, | |
| "reward_std": 0.8996341824531555, | |
| "rewards/cosine_scaled_reward": 0.17773428186774254, | |
| "rewards/format_reward": 0.7083333358168602, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 2803.0208854675293, | |
| "epoch": 0.408, | |
| "grad_norm": 0.02899840474128723, | |
| "kl": 0.0002684593200683594, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0, | |
| "reward": 0.3746182285249233, | |
| "reward_std": 0.6245617978274822, | |
| "rewards/cosine_scaled_reward": -0.03144089970737696, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2429.312530517578, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.022451486438512802, | |
| "kl": 0.00027573108673095703, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0, | |
| "reward": 1.352016813121736, | |
| "reward_std": 0.6039175763726234, | |
| "rewards/cosine_scaled_reward": 0.31142502650618553, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 2191.812515258789, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.03193458169698715, | |
| "kl": 0.00021660327911376953, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0, | |
| "reward": 0.7233314402401447, | |
| "reward_std": 0.7219348400831223, | |
| "rewards/cosine_scaled_reward": 0.059582391288131475, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 2400.979217529297, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.02467941865324974, | |
| "kl": 0.00025135278701782227, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0, | |
| "reward": 0.9745171531103551, | |
| "reward_std": 0.8171700052917004, | |
| "rewards/cosine_scaled_reward": 0.15392523724585772, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 2900.041732788086, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.02299378626048565, | |
| "kl": 0.0003606081008911133, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0, | |
| "reward": 0.8722579134628177, | |
| "reward_std": 0.942744567990303, | |
| "rewards/cosine_scaled_reward": 0.17571225319989026, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1323.479190826416, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.03528214618563652, | |
| "kl": 0.000218123197555542, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0, | |
| "reward": 1.2730410918593407, | |
| "reward_std": 0.7479732874780893, | |
| "rewards/cosine_scaled_reward": 0.19902052055113018, | |
| "rewards/format_reward": 0.8750000074505806, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 1805.2916793823242, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.029074430465698242, | |
| "kl": 0.00025790929794311523, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0, | |
| "reward": 1.1047739461064339, | |
| "reward_std": 0.6205479633063078, | |
| "rewards/cosine_scaled_reward": 0.1982202921062708, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 2740.7083587646484, | |
| "epoch": 0.416, | |
| "grad_norm": 0.021566243842244148, | |
| "kl": 0.000292360782623291, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0, | |
| "reward": 0.36955657228827477, | |
| "reward_std": 0.7109892182052135, | |
| "rewards/cosine_scaled_reward": -0.05480505700688809, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 2874.7500534057617, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.023213258013129234, | |
| "kl": 0.00028192996978759766, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0, | |
| "reward": 0.25813389755785465, | |
| "reward_std": 0.7393531780689955, | |
| "rewards/cosine_scaled_reward": -0.048016379587352276, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1737.541675567627, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 0.03312158212065697, | |
| "kl": 0.00028526782989501953, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0, | |
| "reward": 1.2915470600128174, | |
| "reward_std": 0.7252576723694801, | |
| "rewards/cosine_scaled_reward": 0.27077352383639663, | |
| "rewards/format_reward": 0.75, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 2430.625045776367, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.023319371044635773, | |
| "kl": 0.00021529197692871094, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0, | |
| "reward": 0.817657120525837, | |
| "reward_std": 0.5943502280861139, | |
| "rewards/cosine_scaled_reward": 0.09632855094969273, | |
| "rewards/format_reward": 0.6250000037252903, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 3034.270835876465, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.026619471609592438, | |
| "kl": 0.0003840923309326172, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0, | |
| "reward": 0.38300488237291574, | |
| "reward_std": 0.5576745513826609, | |
| "rewards/cosine_scaled_reward": 0.024835781194269657, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 2265.8334045410156, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.02840518392622471, | |
| "kl": 0.00026166439056396484, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0, | |
| "reward": 0.9877472408115864, | |
| "reward_std": 0.911319050937891, | |
| "rewards/cosine_scaled_reward": 0.18137363530695438, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 3168.2916717529297, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 0.02979353815317154, | |
| "kl": 0.00041425228118896484, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0, | |
| "reward": 0.3131989259272814, | |
| "reward_std": 0.9070627521723509, | |
| "rewards/cosine_scaled_reward": 0.03159945458173752, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1747.312515258789, | |
| "epoch": 0.424, | |
| "grad_norm": 0.04161190241575241, | |
| "kl": 0.00025528669357299805, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0, | |
| "reward": 0.7982922215014696, | |
| "reward_std": 0.31508788373321295, | |
| "rewards/cosine_scaled_reward": 0.0762293990701437, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 2756.2708587646484, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.02483726479113102, | |
| "kl": 0.0003197193145751953, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0, | |
| "reward": 1.0121797006577253, | |
| "reward_std": 0.9594811573624611, | |
| "rewards/cosine_scaled_reward": 0.2248398419469595, | |
| "rewards/format_reward": 0.5625000093132257, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 1751.770866394043, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.03236696869134903, | |
| "kl": 0.00019246339797973633, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0, | |
| "reward": 0.8297730721533298, | |
| "reward_std": 0.8522606380283833, | |
| "rewards/cosine_scaled_reward": 0.039886531041702256, | |
| "rewards/format_reward": 0.7500000037252903, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1826.958351135254, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.038021981716156006, | |
| "kl": 0.0002339780330657959, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0, | |
| "reward": 0.8999335905537009, | |
| "reward_std": 0.7978312708437443, | |
| "rewards/cosine_scaled_reward": 0.08538344036787748, | |
| "rewards/format_reward": 0.729166679084301, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 2728.250015258789, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.025368480011820793, | |
| "kl": 0.0003457069396972656, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0, | |
| "reward": 0.6862165480852127, | |
| "reward_std": 0.7109800316393375, | |
| "rewards/cosine_scaled_reward": 0.12435825169086456, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 2175.3333587646484, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 0.0324525311589241, | |
| "kl": 0.0003424808382987976, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0, | |
| "reward": 0.5297784507274628, | |
| "reward_std": 0.4516606852412224, | |
| "rewards/cosine_scaled_reward": -0.05802745930850506, | |
| "rewards/format_reward": 0.645833333954215, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 3404.8750915527344, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.019506795331835747, | |
| "kl": 0.00038254261016845703, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0, | |
| "reward": 0.21527918288484216, | |
| "reward_std": 0.9994356147944927, | |
| "rewards/cosine_scaled_reward": -0.05902708321809769, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1961.4167003631592, | |
| "epoch": 0.432, | |
| "grad_norm": 0.039375122636556625, | |
| "kl": 0.00024241209030151367, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0, | |
| "reward": 0.893386272713542, | |
| "reward_std": 0.8861826229840517, | |
| "rewards/cosine_scaled_reward": 0.10294315160717815, | |
| "rewards/format_reward": 0.687500013038516, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 3086.8542098999023, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.022253919392824173, | |
| "kl": 0.0003380775451660156, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0, | |
| "reward": 0.2040771245956421, | |
| "reward_std": 0.7313600815832615, | |
| "rewards/cosine_scaled_reward": -0.033378101885318756, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 2130.1666870117188, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 0.031271472573280334, | |
| "kl": 0.0002847909927368164, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0, | |
| "reward": 1.0673715379089117, | |
| "reward_std": 0.6758305430412292, | |
| "rewards/cosine_scaled_reward": 0.16910243220627308, | |
| "rewards/format_reward": 0.7291666772216558, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 2840.5209197998047, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.024770023301243782, | |
| "kl": 0.00039768218994140625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0, | |
| "reward": 0.8948189206421375, | |
| "reward_std": 1.0691867098212242, | |
| "rewards/cosine_scaled_reward": 0.15574277006089687, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1638.2917022705078, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.028202086687088013, | |
| "kl": 0.00019669532775878906, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0, | |
| "reward": 0.4687234200537205, | |
| "reward_std": 0.6315413638949394, | |
| "rewards/cosine_scaled_reward": -0.1718882955610752, | |
| "rewards/format_reward": 0.8125000074505806, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 2525.875030517578, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.028156043961644173, | |
| "kl": 0.0003873109817504883, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0, | |
| "reward": 0.6626773066818714, | |
| "reward_std": 0.6682372465729713, | |
| "rewards/cosine_scaled_reward": 0.06050530215725303, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 2035.6875305175781, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.0351465679705143, | |
| "kl": 0.0002747178077697754, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0, | |
| "reward": 1.3732114508748055, | |
| "reward_std": 1.1746384501457214, | |
| "rewards/cosine_scaled_reward": 0.3428557086735964, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 2768.8541870117188, | |
| "epoch": 0.44, | |
| "grad_norm": 0.020549314096570015, | |
| "kl": 0.00025400519371032715, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0, | |
| "reward": 0.41839180141687393, | |
| "reward_std": 0.7576583102345467, | |
| "rewards/cosine_scaled_reward": -0.06163745652884245, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 2741.3541870117188, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.022283930331468582, | |
| "kl": 0.00032389163970947266, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0, | |
| "reward": 0.35035596787929535, | |
| "reward_std": 0.6659238189458847, | |
| "rewards/cosine_scaled_reward": -0.06440535286674276, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 3046.9791984558105, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.031019089743494987, | |
| "kl": 0.0004036426544189453, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0, | |
| "reward": 0.1448732865974307, | |
| "reward_std": 0.7876984663307667, | |
| "rewards/cosine_scaled_reward": -0.0421467050909996, | |
| "rewards/format_reward": 0.2291666753590107, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 2511.062530517578, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.025159146636724472, | |
| "kl": 0.0003024935722351074, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0, | |
| "reward": 0.6130173578858376, | |
| "reward_std": 0.545312637463212, | |
| "rewards/cosine_scaled_reward": 0.06692530773580074, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 2439.0833854675293, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 0.028031574562191963, | |
| "kl": 0.00030285120010375977, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0, | |
| "reward": 0.5443721693009138, | |
| "reward_std": 0.6922687292098999, | |
| "rewards/cosine_scaled_reward": -0.01948060654103756, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1954.3542175292969, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 0.030413255095481873, | |
| "kl": 0.00020575523376464844, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0, | |
| "reward": 1.0101970732212067, | |
| "reward_std": 0.8248934336006641, | |
| "rewards/cosine_scaled_reward": 0.14051519706845284, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 2316.9583892822266, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.035128720104694366, | |
| "kl": 0.00028714537620544434, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0, | |
| "reward": 1.2428898513317108, | |
| "reward_std": 0.6731934603303671, | |
| "rewards/cosine_scaled_reward": 0.2881115823984146, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1697.4583587646484, | |
| "epoch": 0.448, | |
| "grad_norm": 0.03315872699022293, | |
| "kl": 0.00025475025177001953, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0, | |
| "reward": 0.8411284685134888, | |
| "reward_std": 0.5785612929612398, | |
| "rewards/cosine_scaled_reward": 0.04556424228940159, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 2015.7917022705078, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.027014190331101418, | |
| "kl": 0.00029033422470092773, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0, | |
| "reward": 1.0128965936601162, | |
| "reward_std": 0.9798723421990871, | |
| "rewards/cosine_scaled_reward": 0.13144830497913063, | |
| "rewards/format_reward": 0.7500000018626451, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 2925.187545776367, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.02542806603014469, | |
| "kl": 0.0003414154052734375, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0, | |
| "reward": 0.06015077605843544, | |
| "reward_std": 0.6904054302722216, | |
| "rewards/cosine_scaled_reward": -0.13659127056598663, | |
| "rewards/format_reward": 0.33333333767950535, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 2059.0833740234375, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.055620644241571426, | |
| "kl": 0.0002554953098297119, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0, | |
| "reward": 0.24230369459837675, | |
| "reward_std": 0.5080111399292946, | |
| "rewards/cosine_scaled_reward": -0.139264827943407, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 2738.229217529297, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 0.02174789272248745, | |
| "kl": 0.0003293752670288086, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0, | |
| "reward": 0.5809464631602168, | |
| "reward_std": 0.8570992723107338, | |
| "rewards/cosine_scaled_reward": 0.03005655948072672, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2119.9375534057617, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 0.02603338658809662, | |
| "kl": 0.000241011381149292, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0, | |
| "reward": 0.9264453984797001, | |
| "reward_std": 0.5849390588700771, | |
| "rewards/cosine_scaled_reward": 0.0882226787507534, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 2522.7709045410156, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.023683685809373856, | |
| "kl": 0.000274658203125, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0, | |
| "reward": 0.8225542418658733, | |
| "reward_std": 1.0746662057936192, | |
| "rewards/cosine_scaled_reward": 0.07794378884136677, | |
| "rewards/format_reward": 0.6666666734963655, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 1766.2083740234375, | |
| "epoch": 0.456, | |
| "grad_norm": 0.025745665654540062, | |
| "kl": 0.000264585018157959, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0, | |
| "reward": 1.4033365920186043, | |
| "reward_std": 0.6987381167709827, | |
| "rewards/cosine_scaled_reward": 0.26416830345988274, | |
| "rewards/format_reward": 0.875, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1470.5000305175781, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.030597828328609467, | |
| "kl": 0.00021582841873168945, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0, | |
| "reward": 1.6081257872283459, | |
| "reward_std": 0.7085444070398808, | |
| "rewards/cosine_scaled_reward": 0.33531289361417294, | |
| "rewards/format_reward": 0.9375000074505806, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 3003.7708587646484, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.023032892495393753, | |
| "kl": 0.0003560781478881836, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0, | |
| "reward": 0.5061473976820707, | |
| "reward_std": 0.665686160326004, | |
| "rewards/cosine_scaled_reward": 0.08640703570563346, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 2160.0416946411133, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 0.03311454504728317, | |
| "kl": 0.00029790401458740234, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0, | |
| "reward": 0.4731251299381256, | |
| "reward_std": 0.45016609132289886, | |
| "rewards/cosine_scaled_reward": -0.03427077550441027, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1579.2292251586914, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 0.040304120630025864, | |
| "kl": 0.00027740001678466797, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0, | |
| "reward": 1.0009474894031882, | |
| "reward_std": 0.58954899571836, | |
| "rewards/cosine_scaled_reward": 0.08380707260221243, | |
| "rewards/format_reward": 0.8333333414047956, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 2295.062557220459, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 0.03594672679901123, | |
| "kl": 0.0002923309803009033, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0, | |
| "reward": 0.6277561672031879, | |
| "reward_std": 0.576149944216013, | |
| "rewards/cosine_scaled_reward": 0.04304474964737892, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 2160.333339691162, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.044510871171951294, | |
| "kl": 0.00038023293018341064, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0, | |
| "reward": 0.7880059815943241, | |
| "reward_std": 0.9082814259454608, | |
| "rewards/cosine_scaled_reward": 0.10233633872121572, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 2105.3125228881836, | |
| "epoch": 0.464, | |
| "grad_norm": 0.026859665289521217, | |
| "kl": 0.0003541707992553711, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0, | |
| "reward": 1.1059295609593391, | |
| "reward_std": 0.7627066448330879, | |
| "rewards/cosine_scaled_reward": 0.16754809115082026, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2212.0625381469727, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 0.02859460562467575, | |
| "kl": 0.00026991963386535645, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0, | |
| "reward": 0.7443156484514475, | |
| "reward_std": 0.6628005839884281, | |
| "rewards/cosine_scaled_reward": 0.08049114793539047, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2292.583381652832, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.02938149869441986, | |
| "kl": 0.0002721548080444336, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0, | |
| "reward": 0.84524005651474, | |
| "reward_std": 0.6734911154489964, | |
| "rewards/cosine_scaled_reward": 0.08928670734167099, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 3335.9583740234375, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.0185756403952837, | |
| "kl": 0.0003679990768432617, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0, | |
| "reward": 0.31832231767475605, | |
| "reward_std": 0.9492303058505058, | |
| "rewards/cosine_scaled_reward": 0.002911144867539406, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2268.0833740234375, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 0.03002990409731865, | |
| "kl": 0.000295102596282959, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0, | |
| "reward": 0.7843060363084078, | |
| "reward_std": 1.074190242215991, | |
| "rewards/cosine_scaled_reward": 0.07965302001684904, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 2607.520881652832, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 0.04299235716462135, | |
| "kl": 0.0003064870834350586, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0, | |
| "reward": 0.3397108046337962, | |
| "reward_std": 0.7905894294381142, | |
| "rewards/cosine_scaled_reward": -0.04889458604156971, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2604.625030517578, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 0.02590368315577507, | |
| "kl": 0.00034099817276000977, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0, | |
| "reward": 1.047096872702241, | |
| "reward_std": 0.7374865831807256, | |
| "rewards/cosine_scaled_reward": 0.2631317935883999, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 2449.8750381469727, | |
| "epoch": 0.472, | |
| "grad_norm": 0.023459814488887787, | |
| "kl": 0.0002518892288208008, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0, | |
| "reward": 0.7442583926022053, | |
| "reward_std": 0.9269864112138748, | |
| "rewards/cosine_scaled_reward": 0.09087921027094126, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2950.354217529297, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.01904076710343361, | |
| "kl": 0.0002581775188446045, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0, | |
| "reward": 0.674991624429822, | |
| "reward_std": 0.8730818629264832, | |
| "rewards/cosine_scaled_reward": 0.06666246941313148, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 3076.6666870117188, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 0.023224493488669395, | |
| "kl": 0.0003394484519958496, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0, | |
| "reward": 0.21714868023991585, | |
| "reward_std": 0.7628773404285312, | |
| "rewards/cosine_scaled_reward": -0.05809233826585114, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1542.4583740234375, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.030471056699752808, | |
| "kl": 0.00021645426750183105, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0, | |
| "reward": 0.9979265034198761, | |
| "reward_std": 0.8310877997428179, | |
| "rewards/cosine_scaled_reward": 0.0927132535725832, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 3084.7083587646484, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.021396063268184662, | |
| "kl": 0.00035440921783447266, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0, | |
| "reward": 0.03550974000245333, | |
| "reward_std": 0.7396349869668484, | |
| "rewards/cosine_scaled_reward": -0.12807846441864967, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 2215.229232788086, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.033279899507761, | |
| "kl": 0.00040906667709350586, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0, | |
| "reward": 0.7383007109165192, | |
| "reward_std": 0.3981512626633048, | |
| "rewards/cosine_scaled_reward": 0.07748369872570038, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 2424.5625381469727, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.027742447331547737, | |
| "kl": 0.00024384260177612305, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0, | |
| "reward": 0.6631737016141415, | |
| "reward_std": 0.6519222892820835, | |
| "rewards/cosine_scaled_reward": 0.07117018103599548, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 1580.56254196167, | |
| "epoch": 0.48, | |
| "grad_norm": 0.04106001928448677, | |
| "kl": 0.00024759769439697266, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0, | |
| "reward": 0.5669936803169549, | |
| "reward_std": 0.6721935961395502, | |
| "rewards/cosine_scaled_reward": -0.11233649682253599, | |
| "rewards/format_reward": 0.7916666679084301, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 2925.1250228881836, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.027175476774573326, | |
| "kl": 0.0003466606140136719, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0, | |
| "reward": 0.4191740155220032, | |
| "reward_std": 0.6842759978026152, | |
| "rewards/cosine_scaled_reward": 0.0012536547146737576, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2770.1458740234375, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.0218949094414711, | |
| "kl": 0.00034099817276000977, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0, | |
| "reward": 0.4831668883562088, | |
| "reward_std": 0.7019177824258804, | |
| "rewards/cosine_scaled_reward": 0.012416768004186451, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2963.6667098999023, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 0.024210037663578987, | |
| "kl": 0.0003743171691894531, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0, | |
| "reward": 0.4695848897099495, | |
| "reward_std": 0.885172575712204, | |
| "rewards/cosine_scaled_reward": 0.005625786259770393, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 2673.93758392334, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.028210051357746124, | |
| "kl": 0.00032579898834228516, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0, | |
| "reward": 0.5200405437499285, | |
| "reward_std": 0.9913763739168644, | |
| "rewards/cosine_scaled_reward": 0.041270275600254536, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1474.7500305175781, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.032733287662267685, | |
| "kl": 0.00016742944717407227, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0, | |
| "reward": 2.216394782066345, | |
| "reward_std": 0.6638942826539278, | |
| "rewards/cosine_scaled_reward": 0.6186139956116676, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1995.0416946411133, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.041653890162706375, | |
| "kl": 0.00038945674896240234, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0, | |
| "reward": 0.8248560577630997, | |
| "reward_std": 0.8269761241972446, | |
| "rewards/cosine_scaled_reward": 0.11034468945581466, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 2970.2083587646484, | |
| "epoch": 0.488, | |
| "grad_norm": 0.022912204265594482, | |
| "kl": 0.0003581047058105469, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0, | |
| "reward": 0.3813675809651613, | |
| "reward_std": 0.5880897492170334, | |
| "rewards/cosine_scaled_reward": 0.024017130956053734, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 2192.604202270508, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 0.03489791974425316, | |
| "kl": 0.0003153085708618164, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0, | |
| "reward": 0.6708185374736786, | |
| "reward_std": 1.0236610621213913, | |
| "rewards/cosine_scaled_reward": 0.03332593198865652, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 2013.4167098999023, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.03182235732674599, | |
| "kl": 0.00034117698669433594, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0, | |
| "reward": 0.7822023816406727, | |
| "reward_std": 0.6942301616072655, | |
| "rewards/cosine_scaled_reward": 0.01610116008669138, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 2299.312511444092, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.0236895140260458, | |
| "kl": 0.0003446042537689209, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0, | |
| "reward": 0.884235505014658, | |
| "reward_std": 0.5636085197329521, | |
| "rewards/cosine_scaled_reward": 0.129617752507329, | |
| "rewards/format_reward": 0.625, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 2268.312530517578, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.028193598613142967, | |
| "kl": 0.0004367828369140625, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0, | |
| "reward": 0.5578006114810705, | |
| "reward_std": 0.7199101895093918, | |
| "rewards/cosine_scaled_reward": -0.023183029145002365, | |
| "rewards/format_reward": 0.6041666809469461, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 2806.8125610351562, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.02204781211912632, | |
| "kl": 0.00034296512603759766, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0, | |
| "reward": 0.5058688074350357, | |
| "reward_std": 1.0230275467038155, | |
| "rewards/cosine_scaled_reward": 0.0237677285913378, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 2901.750030517578, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.023653727024793625, | |
| "kl": 0.00032711029052734375, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0, | |
| "reward": 0.39292088645743206, | |
| "reward_std": 0.6577230170369148, | |
| "rewards/cosine_scaled_reward": 0.0193770844489336, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2797.5208740234375, | |
| "epoch": 0.496, | |
| "grad_norm": 0.026688504964113235, | |
| "kl": 0.0003750920295715332, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0, | |
| "reward": -0.11356328427791595, | |
| "reward_std": 0.5170617178082466, | |
| "rewards/cosine_scaled_reward": -0.23386498540639877, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 2098.0416946411133, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.03707343339920044, | |
| "kl": 0.00035834312438964844, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0, | |
| "reward": 0.45782112469896674, | |
| "reward_std": 0.514393039047718, | |
| "rewards/cosine_scaled_reward": -0.07317277602851391, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1943.4167022705078, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 0.031391605734825134, | |
| "kl": 0.0002682805061340332, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0, | |
| "reward": 1.0878197699785233, | |
| "reward_std": 0.7217961475253105, | |
| "rewards/cosine_scaled_reward": 0.15849320590496063, | |
| "rewards/format_reward": 0.7708333376795053, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2698.0000228881836, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.02348100021481514, | |
| "kl": 0.0003057122230529785, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0, | |
| "reward": 0.29235613951459527, | |
| "reward_std": 0.37082277424633503, | |
| "rewards/cosine_scaled_reward": -0.06215526815503836, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2779.583396911621, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.021524518728256226, | |
| "kl": 0.0003088712692260742, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.0, | |
| "reward": 0.6523042246699333, | |
| "reward_std": 0.884812381118536, | |
| "rewards/cosine_scaled_reward": 0.05531877093017101, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2301.604217529297, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.02649969980120659, | |
| "kl": 0.00031888484954833984, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0, | |
| "reward": 0.5825948305428028, | |
| "reward_std": 0.844958171248436, | |
| "rewards/cosine_scaled_reward": -0.010785935446619987, | |
| "rewards/format_reward": 0.6041666734963655, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 2899.458339691162, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.026726072654128075, | |
| "kl": 0.0004227161407470703, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0, | |
| "reward": -0.10262651741504669, | |
| "reward_std": 0.5177539475262165, | |
| "rewards/cosine_scaled_reward": -0.17631326615810394, | |
| "rewards/format_reward": 0.25, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2830.2292137145996, | |
| "epoch": 0.504, | |
| "grad_norm": 0.024895742535591125, | |
| "kl": 0.000400543212890625, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0, | |
| "reward": 0.5320205697789788, | |
| "reward_std": 0.6080023515969515, | |
| "rewards/cosine_scaled_reward": 0.05767695792019367, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2856.729179382324, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.024759601801633835, | |
| "kl": 0.00035691261291503906, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0, | |
| "reward": 0.14839190989732742, | |
| "reward_std": 0.5381195954978466, | |
| "rewards/cosine_scaled_reward": -0.11330404752516188, | |
| "rewards/format_reward": 0.375, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 3081.895896911621, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.022310299798846245, | |
| "kl": 0.00034332275390625, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0, | |
| "reward": 0.271579097956419, | |
| "reward_std": 0.840357281267643, | |
| "rewards/cosine_scaled_reward": -0.02046043984591961, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 2853.8333587646484, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.024710780009627342, | |
| "kl": 0.00029730796813964844, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0, | |
| "reward": -0.03774651139974594, | |
| "reward_std": 0.6097207479178905, | |
| "rewards/cosine_scaled_reward": -0.175123262219131, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 2816.8125534057617, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.023254111409187317, | |
| "kl": 0.0003762245178222656, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0, | |
| "reward": 0.46338549302890897, | |
| "reward_std": 0.5440634004771709, | |
| "rewards/cosine_scaled_reward": -0.05997392535209656, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2824.0416946411133, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.0227351114153862, | |
| "kl": 0.0003020167350769043, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0, | |
| "reward": 0.1085881874896586, | |
| "reward_std": 0.8252651356160641, | |
| "rewards/cosine_scaled_reward": -0.13320591230876744, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1687.4791831970215, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.03260977193713188, | |
| "kl": 0.00029033422470092773, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0, | |
| "reward": 0.66987594217062, | |
| "reward_std": 0.68526791036129, | |
| "rewards/cosine_scaled_reward": -0.05047871172428131, | |
| "rewards/format_reward": 0.7708333395421505, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1887.5000305175781, | |
| "epoch": 0.512, | |
| "grad_norm": 0.03330305963754654, | |
| "kl": 0.0002282261848449707, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0, | |
| "reward": 0.6945083662867546, | |
| "reward_std": 0.6089707408100367, | |
| "rewards/cosine_scaled_reward": 0.013920823112130165, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 2491.375026702881, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 0.02716596983373165, | |
| "kl": 0.00034224987030029297, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0, | |
| "reward": 0.04862409457564354, | |
| "reward_std": 0.44541748240590096, | |
| "rewards/cosine_scaled_reward": -0.18402129039168358, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2376.3333702087402, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.026498140767216682, | |
| "kl": 0.000276029109954834, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0, | |
| "reward": 0.45986196398735046, | |
| "reward_std": 0.5138499777531251, | |
| "rewards/cosine_scaled_reward": -0.009652364067733288, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 2555.812526702881, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 0.03134075179696083, | |
| "kl": 0.00034427642822265625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0, | |
| "reward": 0.52835633745417, | |
| "reward_std": 0.5414374638348818, | |
| "rewards/cosine_scaled_reward": -0.017071835696697235, | |
| "rewards/format_reward": 0.5625000055879354, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 2659.3542098999023, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 0.025225255638360977, | |
| "kl": 0.00030180811882019043, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0, | |
| "reward": 0.6744653210043907, | |
| "reward_std": 0.7822801172733307, | |
| "rewards/cosine_scaled_reward": 0.06639931770041585, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 2583.979259490967, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 0.0343073271214962, | |
| "kl": 0.0003584623336791992, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0, | |
| "reward": 0.6048378571867943, | |
| "reward_std": 0.8740231655538082, | |
| "rewards/cosine_scaled_reward": 0.05241893231868744, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 2162.4167098999023, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 0.024462906643748283, | |
| "kl": 0.0002703070640563965, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0, | |
| "reward": 0.7111623287200928, | |
| "reward_std": 0.5601445361971855, | |
| "rewards/cosine_scaled_reward": -0.00900217518210411, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 2792.5416831970215, | |
| "epoch": 0.52, | |
| "grad_norm": 0.032766759395599365, | |
| "kl": 0.00035133957862854004, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0, | |
| "reward": -0.10949165653437376, | |
| "reward_std": 0.4895745702087879, | |
| "rewards/cosine_scaled_reward": -0.26307916827499866, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 3024.1875228881836, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.02690344676375389, | |
| "kl": 0.00028765201568603516, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0, | |
| "reward": 0.023419101489707828, | |
| "reward_std": 0.571003105957061, | |
| "rewards/cosine_scaled_reward": -0.1549571119248867, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 2981.645881652832, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.03079189360141754, | |
| "kl": 0.0004329681396484375, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0, | |
| "reward": 0.26157396100461483, | |
| "reward_std": 0.710116732865572, | |
| "rewards/cosine_scaled_reward": -0.046296343207359314, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2113.458354949951, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 0.03327843174338341, | |
| "kl": 0.00027066469192504883, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0, | |
| "reward": 0.6744469236582518, | |
| "reward_std": 0.7916180063039064, | |
| "rewards/cosine_scaled_reward": 0.024723445996642113, | |
| "rewards/format_reward": 0.6250000093132257, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1241.8541870117188, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.03587225452065468, | |
| "kl": 0.00023978948593139648, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0, | |
| "reward": 1.3831844218075275, | |
| "reward_std": 0.652434827759862, | |
| "rewards/cosine_scaled_reward": 0.2540921785403043, | |
| "rewards/format_reward": 0.875, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2897.6250610351562, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.023615367710590363, | |
| "kl": 0.0003662109375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0, | |
| "reward": 0.6960660554468632, | |
| "reward_std": 0.8808310199528933, | |
| "rewards/cosine_scaled_reward": 0.10844967514276505, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2998.562545776367, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 0.021689005196094513, | |
| "kl": 0.0003491640090942383, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0, | |
| "reward": 0.4467354565858841, | |
| "reward_std": 0.8461587205529213, | |
| "rewards/cosine_scaled_reward": 0.046284390380606055, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 2987.208335876465, | |
| "epoch": 0.528, | |
| "grad_norm": 0.027593903243541718, | |
| "kl": 0.0003991127014160156, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0, | |
| "reward": -0.17366999201476574, | |
| "reward_std": 0.5358176417648792, | |
| "rewards/cosine_scaled_reward": -0.19100166484713554, | |
| "rewards/format_reward": 0.2083333432674408, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2619.4375610351562, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.020383333787322044, | |
| "kl": 0.0002516806125640869, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0, | |
| "reward": 0.5960872750729322, | |
| "reward_std": 0.5420881155878305, | |
| "rewards/cosine_scaled_reward": 0.07929362449795008, | |
| "rewards/format_reward": 0.4375000037252903, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1739.2500076293945, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 0.03312518447637558, | |
| "kl": 0.0002790689468383789, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0, | |
| "reward": 0.8862170621287078, | |
| "reward_std": 0.4330704230815172, | |
| "rewards/cosine_scaled_reward": 0.07852520421147346, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 2631.979232788086, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 0.028379030525684357, | |
| "kl": 0.00039136409759521484, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0, | |
| "reward": 0.547543827444315, | |
| "reward_std": 1.127050258219242, | |
| "rewards/cosine_scaled_reward": -0.017894762102514505, | |
| "rewards/format_reward": 0.5833333414047956, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 3075.208335876465, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.022313648834824562, | |
| "kl": 0.00036776065826416016, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0, | |
| "reward": 0.19751616939902306, | |
| "reward_std": 0.7597125731408596, | |
| "rewards/cosine_scaled_reward": -0.04707525676349178, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 2750.958354949951, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.02687336690723896, | |
| "kl": 0.0003802776336669922, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0, | |
| "reward": 0.6484562270343304, | |
| "reward_std": 0.9428794495761395, | |
| "rewards/cosine_scaled_reward": 0.07422811887226999, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2817.1666984558105, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.02439255081117153, | |
| "kl": 0.00035244226455688477, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0, | |
| "reward": 0.07247266173362732, | |
| "reward_std": 0.4720825038384646, | |
| "rewards/cosine_scaled_reward": -0.12001369334757328, | |
| "rewards/format_reward": 0.3125, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2571.729202270508, | |
| "epoch": 0.536, | |
| "grad_norm": 0.030684776604175568, | |
| "kl": 0.00042176246643066406, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0, | |
| "reward": 0.2719057723879814, | |
| "reward_std": 0.8036453519016504, | |
| "rewards/cosine_scaled_reward": -0.05154711566865444, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2709.229217529297, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 0.022495390847325325, | |
| "kl": 0.00028634071350097656, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0, | |
| "reward": 0.35282724909484386, | |
| "reward_std": 0.6710633486509323, | |
| "rewards/cosine_scaled_reward": -0.10483637941069901, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 2807.5833587646484, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.021343547850847244, | |
| "kl": 0.00029522180557250977, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0, | |
| "reward": 0.1727424543350935, | |
| "reward_std": 0.5023133680224419, | |
| "rewards/cosine_scaled_reward": -0.08029545284807682, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2692.979202270508, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.021070998162031174, | |
| "kl": 0.0003638267517089844, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0, | |
| "reward": 0.23685874231159687, | |
| "reward_std": 0.6326540801674128, | |
| "rewards/cosine_scaled_reward": -0.12115396093577147, | |
| "rewards/format_reward": 0.47916667349636555, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2968.833366394043, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 0.025542479008436203, | |
| "kl": 0.0003510713577270508, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0, | |
| "reward": 0.5230726413428783, | |
| "reward_std": 0.677385538816452, | |
| "rewards/cosine_scaled_reward": 0.06361966766417027, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2142.437545776367, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 0.026798170059919357, | |
| "kl": 0.00038617849349975586, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0, | |
| "reward": 1.0524125322699547, | |
| "reward_std": 0.6080759838223457, | |
| "rewards/cosine_scaled_reward": 0.23453958705067635, | |
| "rewards/format_reward": 0.5833333395421505, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 2133.979179382324, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 0.026474615558981895, | |
| "kl": 0.0002849102020263672, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0, | |
| "reward": 1.4010358676314354, | |
| "reward_std": 0.9220043309032917, | |
| "rewards/cosine_scaled_reward": 0.2838512365706265, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 2235.916702270508, | |
| "epoch": 0.544, | |
| "grad_norm": 0.02376784197986126, | |
| "kl": 0.0003908872604370117, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0, | |
| "reward": 1.4387648329138756, | |
| "reward_std": 1.0878624990582466, | |
| "rewards/cosine_scaled_reward": 0.3339657662436366, | |
| "rewards/format_reward": 0.7708333507180214, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 1504.1667022705078, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.031137563288211823, | |
| "kl": 0.00024572014808654785, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0, | |
| "reward": 1.313527725636959, | |
| "reward_std": 0.6671716086566448, | |
| "rewards/cosine_scaled_reward": 0.22968053398653865, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 2789.708366394043, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.028890853747725487, | |
| "kl": 0.00031247735023498535, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0, | |
| "reward": 0.7100160159170628, | |
| "reward_std": 0.7924649901688099, | |
| "rewards/cosine_scaled_reward": 0.08417466329410672, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2903.416702270508, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.025586150586605072, | |
| "kl": 0.00043582916259765625, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0, | |
| "reward": 0.32888744259253144, | |
| "reward_std": 0.7015850022435188, | |
| "rewards/cosine_scaled_reward": -0.06472294870764017, | |
| "rewards/format_reward": 0.45833334513008595, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2397.000045776367, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.029168743640184402, | |
| "kl": 0.00034499168395996094, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0, | |
| "reward": 0.3047554672230035, | |
| "reward_std": 0.6797374170273542, | |
| "rewards/cosine_scaled_reward": -0.11845559580251575, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 2620.979217529297, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.018642256036400795, | |
| "kl": 0.00029289722442626953, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.0, | |
| "reward": 0.34190886095166206, | |
| "reward_std": 0.49963630735874176, | |
| "rewards/cosine_scaled_reward": -0.17279557418078184, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2566.8333587646484, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.03212100639939308, | |
| "kl": 0.00039565563201904297, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0, | |
| "reward": 0.8112224619835615, | |
| "reward_std": 0.8934317454695702, | |
| "rewards/cosine_scaled_reward": 0.1556112226098776, | |
| "rewards/format_reward": 0.5000000037252903, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2690.5833892822266, | |
| "epoch": 0.552, | |
| "grad_norm": 0.02576862834393978, | |
| "kl": 0.00036197900772094727, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0, | |
| "reward": 0.5433402359485626, | |
| "reward_std": 0.7418918535113335, | |
| "rewards/cosine_scaled_reward": -0.04082988388836384, | |
| "rewards/format_reward": 0.625, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2239.270881652832, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 0.02764633297920227, | |
| "kl": 0.0002956390380859375, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0, | |
| "reward": 0.9058556277304888, | |
| "reward_std": 0.6842145500704646, | |
| "rewards/cosine_scaled_reward": 0.16126114130020142, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1729.1667022705078, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.032674845308065414, | |
| "kl": 0.00025451183319091797, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0, | |
| "reward": 0.7097102329134941, | |
| "reward_std": 0.5658861342817545, | |
| "rewards/cosine_scaled_reward": -0.020144885405898094, | |
| "rewards/format_reward": 0.7500000055879354, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 1834.5833435058594, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 0.035480376332998276, | |
| "kl": 0.0004056096076965332, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0, | |
| "reward": 0.5227991119027138, | |
| "reward_std": 0.5087759112939239, | |
| "rewards/cosine_scaled_reward": -0.040683780796825886, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1802.9166851043701, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.042596880346536636, | |
| "kl": 0.0003460049629211426, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0, | |
| "reward": 1.4236529916524887, | |
| "reward_std": 0.7221514284610748, | |
| "rewards/cosine_scaled_reward": 0.3472431628033519, | |
| "rewards/format_reward": 0.7291666679084301, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 2004.6666679382324, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.026572274044156075, | |
| "kl": 0.0002899765968322754, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0, | |
| "reward": 0.3370821550488472, | |
| "reward_std": 0.49599855951964855, | |
| "rewards/cosine_scaled_reward": -0.14395895414054394, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 3076.958354949951, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.02607194148004055, | |
| "kl": 0.0003669261932373047, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0, | |
| "reward": 0.014304319396615028, | |
| "reward_std": 0.7019931301474571, | |
| "rewards/cosine_scaled_reward": -0.11784783285111189, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1909.958366394043, | |
| "epoch": 0.56, | |
| "grad_norm": 0.03012062795460224, | |
| "kl": 0.00032451748847961426, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0, | |
| "reward": 0.8168525598011911, | |
| "reward_std": 0.5254552476108074, | |
| "rewards/cosine_scaled_reward": 0.012592926621437073, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2243.708351135254, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.027276068925857544, | |
| "kl": 0.0002563595771789551, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0, | |
| "reward": 1.0440415889024734, | |
| "reward_std": 0.801654189825058, | |
| "rewards/cosine_scaled_reward": 0.17827080003917217, | |
| "rewards/format_reward": 0.6875000055879354, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2608.958396911621, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.03424741327762604, | |
| "kl": 0.00032341480255126953, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0, | |
| "reward": 0.48804343002848327, | |
| "reward_std": 0.7422212455421686, | |
| "rewards/cosine_scaled_reward": 0.014855045825242996, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2090.6250381469727, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.027274034917354584, | |
| "kl": 0.00032576918601989746, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0, | |
| "reward": 0.814801562577486, | |
| "reward_std": 0.9762416146695614, | |
| "rewards/cosine_scaled_reward": 0.021984107792377472, | |
| "rewards/format_reward": 0.7708333414047956, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1355.145866394043, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.0340682789683342, | |
| "kl": 0.0002550482749938965, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0, | |
| "reward": 1.5873138904571533, | |
| "reward_std": 0.8576665632426739, | |
| "rewards/cosine_scaled_reward": 0.30407360196113586, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 2332.6041946411133, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.027515675872564316, | |
| "kl": 0.0003358125686645508, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0, | |
| "reward": 0.4851825125515461, | |
| "reward_std": 0.766112394630909, | |
| "rewards/cosine_scaled_reward": -0.017825426533818245, | |
| "rewards/format_reward": 0.5208333395421505, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1843.9792137145996, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.03946586698293686, | |
| "kl": 0.00033032894134521484, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0, | |
| "reward": 1.2787955980747938, | |
| "reward_std": 0.9207109902054071, | |
| "rewards/cosine_scaled_reward": 0.3060644338838756, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 2187.2917137145996, | |
| "epoch": 0.568, | |
| "grad_norm": 0.029607679694890976, | |
| "kl": 0.00023612380027770996, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0, | |
| "reward": 1.211494023911655, | |
| "reward_std": 0.6968455417081714, | |
| "rewards/cosine_scaled_reward": 0.2619969863444567, | |
| "rewards/format_reward": 0.6875000111758709, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2317.7917098999023, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.025722017511725426, | |
| "kl": 0.0003026127815246582, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0, | |
| "reward": 0.9396341107785702, | |
| "reward_std": 0.9828709922730923, | |
| "rewards/cosine_scaled_reward": 0.1469003539532423, | |
| "rewards/format_reward": 0.6458333414047956, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 2263.812545776367, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.0247795432806015, | |
| "kl": 0.00023448467254638672, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0, | |
| "reward": 0.7084082160145044, | |
| "reward_std": 0.8164577055722475, | |
| "rewards/cosine_scaled_reward": 0.041704108007252216, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 2721.2292404174805, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 0.02151358313858509, | |
| "kl": 0.000295102596282959, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0, | |
| "reward": 0.7634283769875765, | |
| "reward_std": 0.8250323310494423, | |
| "rewards/cosine_scaled_reward": 0.11088085174560547, | |
| "rewards/format_reward": 0.5416666734963655, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 8.274834573693823e-06, | |
| "train_runtime": 144683.3506, | |
| "train_samples_per_second": 0.166, | |
| "train_steps_per_second": 0.003 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |