| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 3001.9584350585938, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.18922260403633118, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": -0.0, | |
| "reward": -0.010712452232837677, | |
| "reward_std": 0.48354096710681915, | |
| "rewards/cosine_scaled_reward": -0.1928562317043543, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2822.541717529297, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.28424975275993347, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.0, | |
| "reward": 0.4385625521535985, | |
| "reward_std": 0.8208381980657578, | |
| "rewards/cosine_scaled_reward": -0.009885392151772976, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 2882.4166870117188, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.18410934507846832, | |
| "kl": 3.517171717248857e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0, | |
| "reward": -0.291525443084538, | |
| "reward_std": 0.3761885389685631, | |
| "rewards/cosine_scaled_reward": -0.27076271921396255, | |
| "rewards/format_reward": 0.25, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 3245.416748046875, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.16615347564220428, | |
| "kl": 2.9280781745910645e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0, | |
| "reward": -0.25264428183436394, | |
| "reward_std": 0.4561151713132858, | |
| "rewards/cosine_scaled_reward": -0.24090547114610672, | |
| "rewards/format_reward": 0.22916667349636555, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 2911.3334350585938, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.21166956424713135, | |
| "kl": 3.331899642944336e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0, | |
| "reward": 0.8400040492415428, | |
| "reward_std": 0.885560505092144, | |
| "rewards/cosine_scaled_reward": 0.18041866831481457, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 2720.89599609375, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.23326514661312103, | |
| "kl": 4.035234451293945e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.2041575275361538, | |
| "reward_std": 0.6658071056008339, | |
| "rewards/cosine_scaled_reward": -0.11667125090025365, | |
| "rewards/format_reward": 0.4375, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2360.6458740234375, | |
| "epoch": 0.008, | |
| "grad_norm": 0.2280312329530716, | |
| "kl": 1.850724220275879e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.7341702952980995, | |
| "reward_std": 0.44598812609910965, | |
| "rewards/cosine_scaled_reward": 0.09625181555747986, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2888.8750610351562, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.20181676745414734, | |
| "kl": 2.9474496841430664e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.007411351427435875, | |
| "reward_std": 0.6588219478726387, | |
| "rewards/cosine_scaled_reward": -0.2016223482787609, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3309.8541870117188, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.18089492619037628, | |
| "kl": 4.0411949157714844e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.08656559139490128, | |
| "reward_std": 0.7023323476314545, | |
| "rewards/cosine_scaled_reward": -0.060883864760398865, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2354.729217529297, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.2087497115135193, | |
| "kl": 3.822147846221924e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.0, | |
| "reward": 0.43211155757308006, | |
| "reward_std": 0.7549905180931091, | |
| "rewards/cosine_scaled_reward": -0.054777566343545914, | |
| "rewards/format_reward": 0.5416666679084301, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 2589.041778564453, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.2392556220293045, | |
| "kl": 4.547834396362305e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.0, | |
| "reward": 0.570576427038759, | |
| "reward_std": 1.0850151628255844, | |
| "rewards/cosine_scaled_reward": -0.006378462538123131, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2472.4375610351562, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.2759428322315216, | |
| "kl": 2.5950372219085693e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.7166856527328491, | |
| "reward_std": 0.7806050479412079, | |
| "rewards/cosine_scaled_reward": 0.056259457021951675, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2164.854232788086, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.1908554881811142, | |
| "kl": 2.492964267730713e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.7641210779547691, | |
| "reward_std": 0.6774896830320358, | |
| "rewards/cosine_scaled_reward": 0.10081052035093307, | |
| "rewards/format_reward": 0.5625, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2820.2501220703125, | |
| "epoch": 0.016, | |
| "grad_norm": 0.18801091611385345, | |
| "kl": 3.5822391510009766e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.07261240109801292, | |
| "reward_std": 0.6130082383751869, | |
| "rewards/cosine_scaled_reward": -0.2133895456790924, | |
| "rewards/format_reward": 0.3541666753590107, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 3089.104248046875, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.19688484072685242, | |
| "kl": 3.3468008041381836e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0, | |
| "reward": 0.27143352539860643, | |
| "reward_std": 0.8015548288822174, | |
| "rewards/cosine_scaled_reward": 0.0003000907599925995, | |
| "rewards/format_reward": 0.2708333469927311, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 2362.6876220703125, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.24537329375743866, | |
| "kl": 2.1159648895263672e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.7649998441338539, | |
| "reward_std": 1.021081954240799, | |
| "rewards/cosine_scaled_reward": 0.11166658625006676, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 3128.25, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.20609241724014282, | |
| "kl": 4.242360591888428e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0, | |
| "reward": -0.19378644227981567, | |
| "reward_std": 0.5115947872400284, | |
| "rewards/cosine_scaled_reward": -0.21147656068205833, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 2980.9793090820312, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.2714909315109253, | |
| "kl": 3.966689109802246e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0, | |
| "reward": -0.022395100444555283, | |
| "reward_std": 0.6723635420203209, | |
| "rewards/cosine_scaled_reward": -0.16744754649698734, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3212.604248046875, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.17160819470882416, | |
| "kl": 3.719329833984375e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0, | |
| "reward": 0.07102994620800018, | |
| "reward_std": 0.8850104063749313, | |
| "rewards/cosine_scaled_reward": -0.1207350417971611, | |
| "rewards/format_reward": 0.31250001303851604, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2613.6041870117188, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.24837209284305573, | |
| "kl": 3.3915042877197266e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.0, | |
| "reward": 0.33390188589692116, | |
| "reward_std": 0.713263601064682, | |
| "rewards/cosine_scaled_reward": -0.08304904773831367, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2582.041748046875, | |
| "epoch": 0.024, | |
| "grad_norm": 0.2642858326435089, | |
| "kl": 2.1037645637989044e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0, | |
| "reward": 0.2965797185897827, | |
| "reward_std": 0.5356749221682549, | |
| "rewards/cosine_scaled_reward": -0.06004347978159785, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 3307.6459350585938, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.22419147193431854, | |
| "kl": 4.1961669921875e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0, | |
| "reward": 0.25835999101400375, | |
| "reward_std": 1.1261206567287445, | |
| "rewards/cosine_scaled_reward": -0.04790334962308407, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 3249.1876220703125, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.19173863530158997, | |
| "kl": 4.4405460357666016e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.27271851897239685, | |
| "reward_std": 0.7990642189979553, | |
| "rewards/cosine_scaled_reward": -0.04072406329214573, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2154.25, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.259212851524353, | |
| "kl": 1.8768012523651123e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.5731075070798397, | |
| "reward_std": 0.8421577215194702, | |
| "rewards/cosine_scaled_reward": -0.02594624925404787, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2784.7916870117188, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.29162946343421936, | |
| "kl": 3.090500831604004e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0, | |
| "reward": 0.1705078724771738, | |
| "reward_std": 0.6685621440410614, | |
| "rewards/cosine_scaled_reward": -0.08141273260116577, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3185.729248046875, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.15754370391368866, | |
| "kl": 2.549588680267334e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.13516026688739657, | |
| "reward_std": 0.6272664293646812, | |
| "rewards/cosine_scaled_reward": -0.05741987004876137, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 3129.2083740234375, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.16376672685146332, | |
| "kl": 2.86102294921875e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.0651654414832592, | |
| "reward_std": 0.5805819556117058, | |
| "rewards/cosine_scaled_reward": -0.08200062438845634, | |
| "rewards/format_reward": 0.22916667722165585, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 3173.4791870117188, | |
| "epoch": 0.032, | |
| "grad_norm": 0.2187095433473587, | |
| "kl": 3.802776336669922e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.1244891807436943, | |
| "reward_std": 0.8137174546718597, | |
| "rewards/cosine_scaled_reward": -0.07317209523171186, | |
| "rewards/format_reward": 0.27083334885537624, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3206.0208740234375, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.15626287460327148, | |
| "kl": 1.7024576663970947e-05, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0, | |
| "reward": -0.0882865646854043, | |
| "reward_std": 0.6182056441903114, | |
| "rewards/cosine_scaled_reward": -0.13789328234270215, | |
| "rewards/format_reward": 0.1875000111758709, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3293.979248046875, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.176454558968544, | |
| "kl": 2.8930604457855225e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0, | |
| "reward": 0.12017922103404999, | |
| "reward_std": 0.8002806901931763, | |
| "rewards/cosine_scaled_reward": -0.10657705180346966, | |
| "rewards/format_reward": 0.33333334885537624, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2556.0625610351562, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.2976699471473694, | |
| "kl": 3.6090612411499023e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.13020960986614227, | |
| "reward_std": 0.5589020624756813, | |
| "rewards/cosine_scaled_reward": -0.15364519506692886, | |
| "rewards/format_reward": 0.43750001303851604, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3466.125, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.15761366486549377, | |
| "kl": 3.0994415283203125e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0, | |
| "reward": -0.34774322621524334, | |
| "reward_std": 0.4613388404250145, | |
| "rewards/cosine_scaled_reward": -0.22595495358109474, | |
| "rewards/format_reward": 0.1041666679084301, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3078.729248046875, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.17744146287441254, | |
| "kl": 1.9311904907226562e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.1523735709488392, | |
| "reward_std": 0.7702403217554092, | |
| "rewards/cosine_scaled_reward": -0.10089654847979546, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 3068.2083740234375, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.2183830887079239, | |
| "kl": 1.940131187438965e-05, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0, | |
| "reward": 0.029434625059366226, | |
| "reward_std": 0.7817529812455177, | |
| "rewards/cosine_scaled_reward": -0.1415326923597604, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 3028.916748046875, | |
| "epoch": 0.04, | |
| "grad_norm": 0.1779097616672516, | |
| "kl": 2.0578503608703613e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0, | |
| "reward": 0.2327469252049923, | |
| "reward_std": 0.9538670182228088, | |
| "rewards/cosine_scaled_reward": -0.09195987693965435, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 2689.3959350585938, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.16330143809318542, | |
| "kl": 4.976987838745117e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.5622920989990234, | |
| "reward_std": 0.39920446276664734, | |
| "rewards/cosine_scaled_reward": 0.07281268946826458, | |
| "rewards/format_reward": 0.4166666716337204, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 2801.6459045410156, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.19838035106658936, | |
| "kl": 4.4226646423339844e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0, | |
| "reward": 0.41371238604187965, | |
| "reward_std": 0.575165145099163, | |
| "rewards/cosine_scaled_reward": -0.07439382094889879, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3009.5000610351562, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.1800134778022766, | |
| "kl": 6.097555160522461e-05, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.12262389855459332, | |
| "reward_std": 0.6652626991271973, | |
| "rewards/cosine_scaled_reward": -0.15743806213140488, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 3139.604248046875, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.23411938548088074, | |
| "kl": 4.2323023080825806e-05, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.24589091911911964, | |
| "reward_std": 0.8911770880222321, | |
| "rewards/cosine_scaled_reward": -0.07497121207416058, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 3011.8958740234375, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.1625184565782547, | |
| "kl": 4.693865776062012e-05, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0, | |
| "reward": 0.12772860191762447, | |
| "reward_std": 0.7576778829097748, | |
| "rewards/cosine_scaled_reward": -0.12363571301102638, | |
| "rewards/format_reward": 0.3750000037252903, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3124.2500610351562, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.17387458682060242, | |
| "kl": 1.8164515495300293e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.21644378546625376, | |
| "reward_std": 0.6694483831524849, | |
| "rewards/cosine_scaled_reward": -0.06886144913733006, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2182.5000610351562, | |
| "epoch": 0.048, | |
| "grad_norm": 0.34351351857185364, | |
| "kl": 0.00022931396961212158, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.6580724753439426, | |
| "reward_std": 0.8123672902584076, | |
| "rewards/cosine_scaled_reward": 0.016536223702132702, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 2828.7709350585938, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.19178950786590576, | |
| "kl": 3.0837953090667725e-05, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.31538891326636076, | |
| "reward_std": 0.8877717405557632, | |
| "rewards/cosine_scaled_reward": -0.06105554662644863, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 3036.6250610351562, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.22349753975868225, | |
| "kl": 0.0003269314765930176, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0, | |
| "reward": -0.01474527781829238, | |
| "reward_std": 0.5097765326499939, | |
| "rewards/cosine_scaled_reward": -0.15320597402751446, | |
| "rewards/format_reward": 0.291666679084301, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 2778.5000610351562, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.18280261754989624, | |
| "kl": 5.6609511375427246e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0, | |
| "reward": 0.8612850233912468, | |
| "reward_std": 1.1412108689546585, | |
| "rewards/cosine_scaled_reward": 0.13897587358951569, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 2998.229248046875, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.16858145594596863, | |
| "kl": 5.383044481277466e-05, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0, | |
| "reward": 0.28540395572781563, | |
| "reward_std": 0.43213801458477974, | |
| "rewards/cosine_scaled_reward": -0.02396468259394169, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2551.6458587646484, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.23799559473991394, | |
| "kl": 9.156018495559692e-05, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0, | |
| "reward": 0.44500812888145447, | |
| "reward_std": 0.783466711640358, | |
| "rewards/cosine_scaled_reward": -0.04832928255200386, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2939.1041870117188, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.18564291298389435, | |
| "kl": 0.00010335445404052734, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.0, | |
| "reward": 0.34564049541950226, | |
| "reward_std": 0.9494538530707359, | |
| "rewards/cosine_scaled_reward": -0.02509642019867897, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2282.895835876465, | |
| "epoch": 0.056, | |
| "grad_norm": 0.23054172098636627, | |
| "kl": 0.00024145841598510742, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0, | |
| "reward": 0.44885391741991043, | |
| "reward_std": 0.7631752789020538, | |
| "rewards/cosine_scaled_reward": -0.06723971478641033, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2204.1041870117188, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.29597678780555725, | |
| "kl": 0.0005988925695419312, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0, | |
| "reward": 0.5718545913696289, | |
| "reward_std": 0.6146985068917274, | |
| "rewards/cosine_scaled_reward": 0.02551062125712633, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 3201.6666870117188, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.15334689617156982, | |
| "kl": 8.338689804077148e-05, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0, | |
| "reward": 0.14526839554309845, | |
| "reward_std": 0.8655073121190071, | |
| "rewards/cosine_scaled_reward": -0.07319913152605295, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 3215.0000610351562, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.17531076073646545, | |
| "kl": 0.0001531541347503662, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0, | |
| "reward": -0.015788130462169647, | |
| "reward_std": 0.7165202274918556, | |
| "rewards/cosine_scaled_reward": -0.13289407594129443, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2991.8125610351562, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.28014782071113586, | |
| "kl": 0.000295490026473999, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0, | |
| "reward": -0.08945630304515362, | |
| "reward_std": 0.6164149194955826, | |
| "rewards/cosine_scaled_reward": -0.1905614770948887, | |
| "rewards/format_reward": 0.2916666828095913, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2437.4375, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.21041618287563324, | |
| "kl": 0.00014019012451171875, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0, | |
| "reward": 0.08023202046751976, | |
| "reward_std": 0.43379058688879013, | |
| "rewards/cosine_scaled_reward": -0.18905067443847656, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3115.2501220703125, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.18965038657188416, | |
| "kl": 0.00021153688430786133, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0, | |
| "reward": 0.14397013932466507, | |
| "reward_std": 0.7335802316665649, | |
| "rewards/cosine_scaled_reward": -0.07384827360510826, | |
| "rewards/format_reward": 0.291666679084301, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 3097.0625, | |
| "epoch": 0.064, | |
| "grad_norm": 0.1861303448677063, | |
| "kl": 0.0006959438323974609, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0, | |
| "reward": 0.21384139358997345, | |
| "reward_std": 0.7854617610573769, | |
| "rewards/cosine_scaled_reward": -0.038912639021873474, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 2959.479248046875, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.17092932760715485, | |
| "kl": 0.0004864931106567383, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0, | |
| "reward": -0.0896548442542553, | |
| "reward_std": 0.597286906093359, | |
| "rewards/cosine_scaled_reward": -0.22191076539456844, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 3173.9583740234375, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.16764329373836517, | |
| "kl": 0.0009320974349975586, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0, | |
| "reward": -0.259409268386662, | |
| "reward_std": 0.41252440214157104, | |
| "rewards/cosine_scaled_reward": -0.22345462441444397, | |
| "rewards/format_reward": 0.1875, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 2903.4375610351562, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.2438689023256302, | |
| "kl": 0.0004966259002685547, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0, | |
| "reward": 0.5037799216806889, | |
| "reward_std": 0.6180380508303642, | |
| "rewards/cosine_scaled_reward": 0.05397331342101097, | |
| "rewards/format_reward": 0.39583334513008595, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2846.9375610351562, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.19560052454471588, | |
| "kl": 0.0006766319274902344, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0, | |
| "reward": 0.619435504078865, | |
| "reward_std": 0.54927659034729, | |
| "rewards/cosine_scaled_reward": 0.12221772782504559, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2432.729248046875, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.18966424465179443, | |
| "kl": 0.00019347667694091797, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0, | |
| "reward": 0.8035399168729782, | |
| "reward_std": 0.6529746800661087, | |
| "rewards/cosine_scaled_reward": 0.12051995098590851, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 3356.4583740234375, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.1480625420808792, | |
| "kl": 0.0005307793617248535, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0, | |
| "reward": 0.33844682574272156, | |
| "reward_std": 0.7905219346284866, | |
| "rewards/cosine_scaled_reward": 0.023390088230371475, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 1956.7084350585938, | |
| "epoch": 0.072, | |
| "grad_norm": 0.26979929208755493, | |
| "kl": 0.006456255912780762, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7974750846624374, | |
| "reward_std": 0.7315621674060822, | |
| "rewards/cosine_scaled_reward": 0.05498753860592842, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 2895.8958740234375, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.14358438551425934, | |
| "kl": 0.00026297569274902344, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0, | |
| "reward": 0.6833263337612152, | |
| "reward_std": 0.5810272544622421, | |
| "rewards/cosine_scaled_reward": 0.12291315197944641, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2582.1875610351562, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.1719641089439392, | |
| "kl": 0.0006394386291503906, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0, | |
| "reward": 0.4943835213780403, | |
| "reward_std": 0.9102021306753159, | |
| "rewards/cosine_scaled_reward": -0.002808244898915291, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 3066.2709350585938, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.15914808213710785, | |
| "kl": 0.00035455822944641113, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.0, | |
| "reward": 0.270260289311409, | |
| "reward_std": 0.6933658719062805, | |
| "rewards/cosine_scaled_reward": -0.10445320140570402, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3021.1666870117188, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.15889614820480347, | |
| "kl": 0.0007028579711914062, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0, | |
| "reward": 0.2754221335053444, | |
| "reward_std": 0.6702268719673157, | |
| "rewards/cosine_scaled_reward": -0.028955606278032064, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2520.7291870117188, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.25743117928504944, | |
| "kl": 0.001796722412109375, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2481890171766281, | |
| "reward_std": 0.4365886226296425, | |
| "rewards/cosine_scaled_reward": -0.06340551376342773, | |
| "rewards/format_reward": 0.375, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2519.2500610351562, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.1728557050228119, | |
| "kl": 0.0004031658172607422, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0, | |
| "reward": 0.4316702373325825, | |
| "reward_std": 0.455346904695034, | |
| "rewards/cosine_scaled_reward": -0.02374822273850441, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3165.7708740234375, | |
| "epoch": 0.08, | |
| "grad_norm": 0.13548843562602997, | |
| "kl": 0.0004048347473144531, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0, | |
| "reward": 0.46206507831811905, | |
| "reward_std": 0.8794394135475159, | |
| "rewards/cosine_scaled_reward": -0.008550799917429686, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 2529.8751220703125, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.22490063309669495, | |
| "kl": 0.003349781036376953, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5197920426726341, | |
| "reward_std": 0.823016032576561, | |
| "rewards/cosine_scaled_reward": -0.010937327519059181, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2654.4791870117188, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.20955628156661987, | |
| "kl": 0.0012530684471130371, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7414695173501968, | |
| "reward_std": 0.5663128644227982, | |
| "rewards/cosine_scaled_reward": 0.05823474656790495, | |
| "rewards/format_reward": 0.625, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 1640.041748046875, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.2790416479110718, | |
| "kl": 0.005096435546875, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.0002, | |
| "reward": 0.91811203956604, | |
| "reward_std": 0.8141122311353683, | |
| "rewards/cosine_scaled_reward": 0.08405601512640715, | |
| "rewards/format_reward": 0.75, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2470.5833740234375, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.1816757470369339, | |
| "kl": 0.001283407211303711, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0001, | |
| "reward": 0.595103541854769, | |
| "reward_std": 0.6585821881890297, | |
| "rewards/cosine_scaled_reward": 0.01630176231265068, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2660.8750915527344, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.2641614079475403, | |
| "kl": 0.0019817352294921875, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2930721901357174, | |
| "reward_std": 0.7745417281985283, | |
| "rewards/cosine_scaled_reward": -0.06179725006222725, | |
| "rewards/format_reward": 0.4166666679084301, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2939.604248046875, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.3027961552143097, | |
| "kl": 0.0008873939514160156, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0, | |
| "reward": 0.3248658664524555, | |
| "reward_std": 0.909210205078125, | |
| "rewards/cosine_scaled_reward": -0.014650408178567886, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3152.1666870117188, | |
| "epoch": 0.088, | |
| "grad_norm": 0.16062307357788086, | |
| "kl": 0.001007080078125, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0, | |
| "reward": 0.11907588690519333, | |
| "reward_std": 0.613786868751049, | |
| "rewards/cosine_scaled_reward": -0.06546205282211304, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2635.041717529297, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.19166067242622375, | |
| "kl": 0.0008903741836547852, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.0, | |
| "reward": 0.5825019255280495, | |
| "reward_std": 0.7261854261159897, | |
| "rewards/cosine_scaled_reward": 0.06208430230617523, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 2755.5209350585938, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.17263904213905334, | |
| "kl": 0.0004019737243652344, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0, | |
| "reward": 0.3458161875605583, | |
| "reward_std": 0.717200756072998, | |
| "rewards/cosine_scaled_reward": -0.056258589029312134, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3565.4375, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.1458665281534195, | |
| "kl": 0.00030177831649780273, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0, | |
| "reward": -0.254756236448884, | |
| "reward_std": 0.5783224925398827, | |
| "rewards/cosine_scaled_reward": -0.17946145310997963, | |
| "rewards/format_reward": 0.10416666977107525, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3124.291748046875, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.17081034183502197, | |
| "kl": 0.0008420944213867188, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": 0.0, | |
| "reward": 0.06758889555931091, | |
| "reward_std": 0.7439121454954147, | |
| "rewards/cosine_scaled_reward": -0.10162222757935524, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 3004.0833740234375, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.17911851406097412, | |
| "kl": 0.0006622076034545898, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0, | |
| "reward": 0.3707614615559578, | |
| "reward_std": 0.8215866684913635, | |
| "rewards/cosine_scaled_reward": -0.0437859346420737, | |
| "rewards/format_reward": 0.45833334885537624, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 3440.3125, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.15295840799808502, | |
| "kl": 0.00017774105072021484, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0, | |
| "reward": -0.39509791135787964, | |
| "reward_std": 0.5668186843395233, | |
| "rewards/cosine_scaled_reward": -0.260048970580101, | |
| "rewards/format_reward": 0.125, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2530.7083740234375, | |
| "epoch": 0.096, | |
| "grad_norm": 0.2006414830684662, | |
| "kl": 0.0003807544708251953, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0, | |
| "reward": 0.7480560662224889, | |
| "reward_std": 1.0157663226127625, | |
| "rewards/cosine_scaled_reward": 0.0927780270576477, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3185.8541870117188, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.17822831869125366, | |
| "kl": 0.0009975433349609375, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0, | |
| "reward": 0.07128806412220001, | |
| "reward_std": 0.8152596428990364, | |
| "rewards/cosine_scaled_reward": -0.08935598330572248, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 2827.5625610351562, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.1663668006658554, | |
| "kl": 0.0010325908660888672, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0, | |
| "reward": 0.19359283335506916, | |
| "reward_std": 0.644717700779438, | |
| "rewards/cosine_scaled_reward": -0.10112026333808899, | |
| "rewards/format_reward": 0.3958333544433117, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 2942.5, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.1894853264093399, | |
| "kl": 0.0014657974243164062, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1782783716917038, | |
| "reward_std": 0.7724725604057312, | |
| "rewards/cosine_scaled_reward": -0.08794412622228265, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2836.3959350585938, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.1908150315284729, | |
| "kl": 0.0014390945434570312, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3470733240246773, | |
| "reward_std": 0.8534664362668991, | |
| "rewards/cosine_scaled_reward": -0.024379996582865715, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3278.6041870117188, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.1539601981639862, | |
| "kl": 0.0004895925521850586, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.0, | |
| "reward": 0.30410441011190414, | |
| "reward_std": 0.6761599257588387, | |
| "rewards/cosine_scaled_reward": 0.006218895316123962, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 2985.3958740234375, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.17722909152507782, | |
| "kl": 0.00044274330139160156, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0, | |
| "reward": 0.2143753319978714, | |
| "reward_std": 0.6936175674200058, | |
| "rewards/cosine_scaled_reward": -0.0594789981842041, | |
| "rewards/format_reward": 0.3333333358168602, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3058.5000610351562, | |
| "epoch": 0.104, | |
| "grad_norm": 0.19192735850811005, | |
| "kl": 0.0004374980926513672, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0, | |
| "reward": 0.302869388833642, | |
| "reward_std": 0.5636695921421051, | |
| "rewards/cosine_scaled_reward": -0.04648197069764137, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2699.6250610351562, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.17412729561328888, | |
| "kl": 0.0012140274047851562, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0, | |
| "reward": 0.5564358681440353, | |
| "reward_std": 0.717531181871891, | |
| "rewards/cosine_scaled_reward": 0.059467924758791924, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 2346.1250610351562, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.2216739058494568, | |
| "kl": 0.0015277862548828125, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5152877140790224, | |
| "reward_std": 0.6053595095872879, | |
| "rewards/cosine_scaled_reward": -0.0027728192508220673, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3211.8333740234375, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.18879717588424683, | |
| "kl": 0.000827789306640625, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0, | |
| "reward": -0.015797210857272148, | |
| "reward_std": 0.735307015478611, | |
| "rewards/cosine_scaled_reward": -0.13289860635995865, | |
| "rewards/format_reward": 0.2500000037252903, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3198.1459350585938, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.15773996710777283, | |
| "kl": 0.00040841102600097656, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0, | |
| "reward": -0.016605263575911522, | |
| "reward_std": 0.7409057542681694, | |
| "rewards/cosine_scaled_reward": -0.16455264016985893, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2382.2084350585938, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.2195434868335724, | |
| "kl": 0.0015625953674316406, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3714839336462319, | |
| "reward_std": 0.5286353975534439, | |
| "rewards/cosine_scaled_reward": -0.08509137481451035, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 2969.479248046875, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.22521458566188812, | |
| "kl": 0.0011968612670898438, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0, | |
| "reward": 0.023297425359487534, | |
| "reward_std": 0.6409126222133636, | |
| "rewards/cosine_scaled_reward": -0.14460130035877228, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2782.479248046875, | |
| "epoch": 0.112, | |
| "grad_norm": 0.48463404178619385, | |
| "kl": 0.01540231704711914, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0006, | |
| "reward": 0.2789543569087982, | |
| "reward_std": 0.6075774282217026, | |
| "rewards/cosine_scaled_reward": -0.05843949131667614, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2909.1458435058594, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.19729964435100555, | |
| "kl": 0.000751495361328125, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0, | |
| "reward": 0.3117349073290825, | |
| "reward_std": 0.5036360248923302, | |
| "rewards/cosine_scaled_reward": -0.010799217969179153, | |
| "rewards/format_reward": 0.33333333395421505, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 3006.9375610351562, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.1744341254234314, | |
| "kl": 0.0009255409240722656, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0, | |
| "reward": 0.4609271613880992, | |
| "reward_std": 0.859523817896843, | |
| "rewards/cosine_scaled_reward": 0.022130253724753857, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 2650.979217529297, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.18743358552455902, | |
| "kl": 0.0010256767272949219, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0, | |
| "reward": 0.21766437217593193, | |
| "reward_std": 0.6801646202802658, | |
| "rewards/cosine_scaled_reward": -0.09950115904211998, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2554.1875610351562, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.18099477887153625, | |
| "kl": 0.001209259033203125, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0, | |
| "reward": 0.29797927755862474, | |
| "reward_std": 0.4223637208342552, | |
| "rewards/cosine_scaled_reward": -0.10101036727428436, | |
| "rewards/format_reward": 0.5, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 2658.979248046875, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.15931963920593262, | |
| "kl": 0.0010652542114257812, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0, | |
| "reward": 0.23173093050718307, | |
| "reward_std": 0.6561538353562355, | |
| "rewards/cosine_scaled_reward": -0.10288454219698906, | |
| "rewards/format_reward": 0.4375, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2505.5833740234375, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.18099236488342285, | |
| "kl": 0.0005369186401367188, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0, | |
| "reward": 0.9495851993560791, | |
| "reward_std": 0.7366478592157364, | |
| "rewards/cosine_scaled_reward": 0.19354257080703974, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 2778.3125, | |
| "epoch": 0.12, | |
| "grad_norm": 0.20704622566699982, | |
| "kl": 0.0013036727905273438, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0001, | |
| "reward": 0.24978191778063774, | |
| "reward_std": 0.7690765783190727, | |
| "rewards/cosine_scaled_reward": -0.09385904669761658, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 2914.6250610351562, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.1921072155237198, | |
| "kl": 0.001560211181640625, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3622821723110974, | |
| "reward_std": 0.912909746170044, | |
| "rewards/cosine_scaled_reward": -0.01677557732909918, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2109.354248046875, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.2092333436012268, | |
| "kl": 0.0012311935424804688, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0, | |
| "reward": 0.5422526616603136, | |
| "reward_std": 0.8620414137840271, | |
| "rewards/cosine_scaled_reward": -0.062207008711993694, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2214.9583587646484, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.2060472071170807, | |
| "kl": 0.0030279159545898438, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6110497042536736, | |
| "reward_std": 0.7519760131835938, | |
| "rewards/cosine_scaled_reward": 0.03469152469187975, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2803.2083740234375, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.20012950897216797, | |
| "kl": 0.0010724067687988281, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0, | |
| "reward": 0.08136957883834839, | |
| "reward_std": 0.6037605702877045, | |
| "rewards/cosine_scaled_reward": -0.15723188465926796, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2464.7500610351562, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.22272928059101105, | |
| "kl": 0.0024871826171875, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9461969807744026, | |
| "reward_std": 0.842521145939827, | |
| "rewards/cosine_scaled_reward": 0.1710151496808976, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 1819.3750610351562, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.24292264878749847, | |
| "kl": 0.0017080307006835938, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7363898158073425, | |
| "reward_std": 0.7160477414727211, | |
| "rewards/cosine_scaled_reward": 0.003611571155488491, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 2938.8333740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.20110748708248138, | |
| "kl": 0.0015575885772705078, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0001, | |
| "reward": 0.47850653529167175, | |
| "reward_std": 0.8684659749269485, | |
| "rewards/cosine_scaled_reward": 0.020503249019384384, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2170.791717529297, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.22352100908756256, | |
| "kl": 0.0018672943115234375, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8810203373432159, | |
| "reward_std": 0.6223750859498978, | |
| "rewards/cosine_scaled_reward": 0.09676016308367252, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 1761.1250457763672, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.22250227630138397, | |
| "kl": 0.0011968612670898438, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0, | |
| "reward": 0.8257871624082327, | |
| "reward_std": 0.5129944495856762, | |
| "rewards/cosine_scaled_reward": 0.08997690677642822, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 2406.6875610351562, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.18063588440418243, | |
| "kl": 0.0014677047729492188, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7925823777914047, | |
| "reward_std": 1.05247762799263, | |
| "rewards/cosine_scaled_reward": 0.08379119075834751, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 2417.8958435058594, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.23391500115394592, | |
| "kl": 0.004772186279296875, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0002, | |
| "reward": 0.15583854354918003, | |
| "reward_std": 0.6483651623129845, | |
| "rewards/cosine_scaled_reward": -0.14083073096117005, | |
| "rewards/format_reward": 0.4375, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3263.666748046875, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.1551298350095749, | |
| "kl": 0.0015048980712890625, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0001, | |
| "reward": -0.017866918817162514, | |
| "reward_std": 0.6443519741296768, | |
| "rewards/cosine_scaled_reward": -0.10268345987424254, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2010.8750305175781, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.21352525055408478, | |
| "kl": 0.001308441162109375, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8165969103574753, | |
| "reward_std": 0.8080126643180847, | |
| "rewards/cosine_scaled_reward": 0.012465095147490501, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 1884.0625610351562, | |
| "epoch": 0.136, | |
| "grad_norm": 0.27621325850486755, | |
| "kl": 0.0025424957275390625, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5952838063240051, | |
| "reward_std": 0.5625797361135483, | |
| "rewards/cosine_scaled_reward": -0.046108097303658724, | |
| "rewards/format_reward": 0.6875, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 2755.3959350585938, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.23236262798309326, | |
| "kl": 0.0017757415771484375, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0001, | |
| "reward": 0.063610197044909, | |
| "reward_std": 0.8038829490542412, | |
| "rewards/cosine_scaled_reward": -0.18694491172209382, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 2782.5833740234375, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.186203733086586, | |
| "kl": 0.0014486312866210938, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2508644163608551, | |
| "reward_std": 0.5808881223201752, | |
| "rewards/cosine_scaled_reward": -0.09331781789660454, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 2584.0625915527344, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.2748485803604126, | |
| "kl": 0.0027008056640625, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0001, | |
| "reward": 0.24610598012804985, | |
| "reward_std": 0.4119979292154312, | |
| "rewards/cosine_scaled_reward": -0.11653035134077072, | |
| "rewards/format_reward": 0.47916667722165585, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2821.416748046875, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.27934524416923523, | |
| "kl": 0.0057964324951171875, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0002, | |
| "reward": 0.10106497257947922, | |
| "reward_std": 0.7212768346071243, | |
| "rewards/cosine_scaled_reward": -0.10571751650422812, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2463.0000915527344, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.22744229435920715, | |
| "kl": 0.0014429092407226562, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.0001, | |
| "reward": 0.8311970978975296, | |
| "reward_std": 0.8409435376524925, | |
| "rewards/cosine_scaled_reward": 0.11351519823074341, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 3028.6459350585938, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.230963334441185, | |
| "kl": 0.0016689300537109375, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0001, | |
| "reward": 0.08886492438614368, | |
| "reward_std": 0.5733988359570503, | |
| "rewards/cosine_scaled_reward": -0.13265088573098183, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 1943.6667175292969, | |
| "epoch": 0.144, | |
| "grad_norm": 0.26326608657836914, | |
| "kl": 0.0029306411743164062, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6195674315094948, | |
| "reward_std": 0.7148094028234482, | |
| "rewards/cosine_scaled_reward": -0.03396627772599459, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2635.9166870117188, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.2009022980928421, | |
| "kl": 0.0012750625610351562, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": 0.0001, | |
| "reward": 0.41267674416303635, | |
| "reward_std": 0.8958253264427185, | |
| "rewards/cosine_scaled_reward": -0.022828295826911926, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 2321.2500610351562, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.19144511222839355, | |
| "kl": 0.00170135498046875, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5039072521030903, | |
| "reward_std": 0.8606824576854706, | |
| "rewards/cosine_scaled_reward": -0.029296381399035454, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 1837.7917175292969, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.22352828085422516, | |
| "kl": 0.0069904327392578125, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5292131304740906, | |
| "reward_std": 0.6835447549819946, | |
| "rewards/cosine_scaled_reward": -0.09997677942737937, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 2737.7918090820312, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.21162550151348114, | |
| "kl": 0.00159454345703125, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0001, | |
| "reward": 0.1000329963862896, | |
| "reward_std": 0.6349897980690002, | |
| "rewards/cosine_scaled_reward": -0.15831685485318303, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 2639.3750610351562, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.21849536895751953, | |
| "kl": 0.001689910888671875, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0001, | |
| "reward": 0.3575108479708433, | |
| "reward_std": 0.7335042506456375, | |
| "rewards/cosine_scaled_reward": -0.07124457694590092, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 2595.791748046875, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.2819630801677704, | |
| "kl": 0.0028476715087890625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0001, | |
| "reward": -0.14514993596822023, | |
| "reward_std": 0.4842909276485443, | |
| "rewards/cosine_scaled_reward": -0.2704916410148144, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 2752.2084350585938, | |
| "epoch": 0.152, | |
| "grad_norm": 0.20234017074108124, | |
| "kl": 0.002986907958984375, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0001, | |
| "reward": 0.15741661936044693, | |
| "reward_std": 0.6936222016811371, | |
| "rewards/cosine_scaled_reward": -0.15045835822820663, | |
| "rewards/format_reward": 0.4583333507180214, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 1920.0000610351562, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.30035078525543213, | |
| "kl": 0.00372314453125, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7273320555686951, | |
| "reward_std": 0.7046244740486145, | |
| "rewards/cosine_scaled_reward": 0.01991601102054119, | |
| "rewards/format_reward": 0.6875, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2167.8334350585938, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.2100658118724823, | |
| "kl": 0.0017566680908203125, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4464118145406246, | |
| "reward_std": 0.3928603231906891, | |
| "rewards/cosine_scaled_reward": -0.03721076436340809, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2357.2084350585938, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.24143747985363007, | |
| "kl": 0.00250244140625, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6343938559293747, | |
| "reward_std": 0.7614049315452576, | |
| "rewards/cosine_scaled_reward": 0.004696924239397049, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 2482.1875610351562, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.22769631445407867, | |
| "kl": 0.003353118896484375, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09337181597948074, | |
| "reward_std": 0.6429625153541565, | |
| "rewards/cosine_scaled_reward": -0.18248076736927032, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2043.5000610351562, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.2516387403011322, | |
| "kl": 0.003204345703125, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9434101283550262, | |
| "reward_std": 0.8629068434238434, | |
| "rewards/cosine_scaled_reward": 0.13837172836065292, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2047.5625, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.2453654259443283, | |
| "kl": 0.0018482208251953125, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7086100317537785, | |
| "reward_std": 0.5908297449350357, | |
| "rewards/cosine_scaled_reward": 0.00013833213597536087, | |
| "rewards/format_reward": 0.7083333507180214, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2375.7084350585938, | |
| "epoch": 0.16, | |
| "grad_norm": 0.1979781836271286, | |
| "kl": 0.0024309158325195312, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0001, | |
| "reward": 0.619663898833096, | |
| "reward_std": 0.41604873538017273, | |
| "rewards/cosine_scaled_reward": 0.007748600095510483, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2381.7708740234375, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.23859204351902008, | |
| "kl": 0.003826141357421875, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3829444032162428, | |
| "reward_std": 0.5856426432728767, | |
| "rewards/cosine_scaled_reward": -0.05852780118584633, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2265.4584045410156, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.26838216185569763, | |
| "kl": 0.0041828155517578125, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0002, | |
| "reward": 0.459966566413641, | |
| "reward_std": 0.5846913754940033, | |
| "rewards/cosine_scaled_reward": -0.030433382838964462, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 2539.479248046875, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.18913578987121582, | |
| "kl": 0.0029649734497070312, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0001, | |
| "reward": 0.42852520011365414, | |
| "reward_std": 0.6579816788434982, | |
| "rewards/cosine_scaled_reward": -0.05657072924077511, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 2627.8125610351562, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.1790352761745453, | |
| "kl": 0.0033435821533203125, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7180662602186203, | |
| "reward_std": 0.9851722121238708, | |
| "rewards/cosine_scaled_reward": 0.05694979056715965, | |
| "rewards/format_reward": 0.6041666939854622, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 2526.541778564453, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.22108972072601318, | |
| "kl": 0.0050811767578125, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0002, | |
| "reward": 0.40608268324285746, | |
| "reward_std": 0.8329223841428757, | |
| "rewards/cosine_scaled_reward": -0.0573753397911787, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 2217.0625610351562, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.37226402759552, | |
| "kl": 0.00292205810546875, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0001, | |
| "reward": 0.29790709912776947, | |
| "reward_std": 0.6913768872618675, | |
| "rewards/cosine_scaled_reward": -0.19479646161198616, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2100.500030517578, | |
| "epoch": 0.168, | |
| "grad_norm": 0.1791330724954605, | |
| "kl": 0.0022411346435546875, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6166809126734734, | |
| "reward_std": 0.7499666661024094, | |
| "rewards/cosine_scaled_reward": -0.014576207846403122, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 1953.1667022705078, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.26837047934532166, | |
| "kl": 0.0032806396484375, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0001, | |
| "reward": 0.22773092985153198, | |
| "reward_std": 0.5684618726372719, | |
| "rewards/cosine_scaled_reward": -0.21946788486093283, | |
| "rewards/format_reward": 0.6666666679084301, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 2537.8958740234375, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.19118516147136688, | |
| "kl": 0.0032138824462890625, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9459018707275391, | |
| "reward_std": 0.6409400217235088, | |
| "rewards/cosine_scaled_reward": 0.18128425255417824, | |
| "rewards/format_reward": 0.5833333488553762, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2327.9375610351562, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.22041891515254974, | |
| "kl": 0.004886627197265625, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.0002, | |
| "reward": 0.2033998966217041, | |
| "reward_std": 0.6696746721863747, | |
| "rewards/cosine_scaled_reward": -0.1587167321704328, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2248.9375610351562, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.20176808536052704, | |
| "kl": 0.0022144317626953125, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": 0.0001, | |
| "reward": 0.2744840234518051, | |
| "reward_std": 0.46313488483428955, | |
| "rewards/cosine_scaled_reward": -0.11275799572467804, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 2933.0833740234375, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.26598456501960754, | |
| "kl": 0.005481719970703125, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0002, | |
| "reward": 0.04549443535506725, | |
| "reward_std": 0.48727741837501526, | |
| "rewards/cosine_scaled_reward": -0.13350277952849865, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2075.729278564453, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.19798634946346283, | |
| "kl": 0.0023260116577148438, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9074295610189438, | |
| "reward_std": 0.5949664637446404, | |
| "rewards/cosine_scaled_reward": 0.09954808466136456, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2754.791748046875, | |
| "epoch": 0.176, | |
| "grad_norm": 0.19806884229183197, | |
| "kl": 0.0027751922607421875, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0001, | |
| "reward": 0.6114984676241875, | |
| "reward_std": 0.6458263993263245, | |
| "rewards/cosine_scaled_reward": 0.08699923381209373, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 3066.354248046875, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.29352447390556335, | |
| "kl": 0.00366973876953125, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": 0.0001, | |
| "reward": 0.017834719270467758, | |
| "reward_std": 0.8173775672912598, | |
| "rewards/cosine_scaled_reward": -0.14733264222741127, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 2754.4375610351562, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.19924919307231903, | |
| "kl": 0.005084991455078125, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0002, | |
| "reward": 0.06147514842450619, | |
| "reward_std": 0.5927468463778496, | |
| "rewards/cosine_scaled_reward": -0.14634575322270393, | |
| "rewards/format_reward": 0.3541666828095913, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2206.250030517578, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.23515692353248596, | |
| "kl": 0.0025005340576171875, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9370372518897057, | |
| "reward_std": 0.8283505141735077, | |
| "rewards/cosine_scaled_reward": 0.1456019375473261, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 2862.7708740234375, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.19533918797969818, | |
| "kl": 0.004573822021484375, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.0002, | |
| "reward": -0.053055196069180965, | |
| "reward_std": 0.6198497340083122, | |
| "rewards/cosine_scaled_reward": -0.23486093431711197, | |
| "rewards/format_reward": 0.41666667349636555, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 1894.8542175292969, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.22211147844791412, | |
| "kl": 0.002826690673828125, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.0001, | |
| "reward": 1.2009564340114594, | |
| "reward_std": 0.7813936918973923, | |
| "rewards/cosine_scaled_reward": 0.1942282197996974, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 1964.9792175292969, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.21944580972194672, | |
| "kl": 0.003116607666015625, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0001, | |
| "reward": 0.9964812844991684, | |
| "reward_std": 0.7849611788988113, | |
| "rewards/cosine_scaled_reward": 0.10240732878446579, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 2853.4583740234375, | |
| "epoch": 0.184, | |
| "grad_norm": 0.1943131983280182, | |
| "kl": 0.00357818603515625, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10683573782444, | |
| "reward_std": 0.6206659823656082, | |
| "rewards/cosine_scaled_reward": -0.1549154706299305, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 2725.604248046875, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.18736310303211212, | |
| "kl": 0.00328826904296875, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.0001, | |
| "reward": 0.16440774500370026, | |
| "reward_std": 0.7531605362892151, | |
| "rewards/cosine_scaled_reward": -0.1782128056511283, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2378.604248046875, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.27985262870788574, | |
| "kl": 0.00499725341796875, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3438632491452154, | |
| "reward_std": 0.8545256406068802, | |
| "rewards/cosine_scaled_reward": -0.06765171512961388, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 2041.791748046875, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.2726307511329651, | |
| "kl": 0.00522613525390625, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1412298008799553, | |
| "reward_std": 0.45817675441503525, | |
| "rewards/cosine_scaled_reward": -0.18980177072808146, | |
| "rewards/format_reward": 0.520833333954215, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 2056.8958740234375, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.24950121343135834, | |
| "kl": 0.0033512115478515625, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4896044433116913, | |
| "reward_std": 0.6808345168828964, | |
| "rewards/cosine_scaled_reward": -0.057281110901385546, | |
| "rewards/format_reward": 0.6041666828095913, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 1701.5625, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.20629195868968964, | |
| "kl": 0.0029201507568359375, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0001, | |
| "reward": 0.4639076357707381, | |
| "reward_std": 0.4746507927775383, | |
| "rewards/cosine_scaled_reward": -0.13262954354286194, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 2674.4375610351562, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.19312238693237305, | |
| "kl": 0.004119873046875, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0002, | |
| "reward": 0.26991652697324753, | |
| "reward_std": 0.8362310528755188, | |
| "rewards/cosine_scaled_reward": -0.11504174256697297, | |
| "rewards/format_reward": 0.5000000111758709, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 1723.5209045410156, | |
| "epoch": 0.192, | |
| "grad_norm": 0.19439440965652466, | |
| "kl": 0.002704620361328125, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7088751941919327, | |
| "reward_std": 0.7652025148272514, | |
| "rewards/cosine_scaled_reward": -0.051812431775033474, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2231.979248046875, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.2904442250728607, | |
| "kl": 0.004784584045410156, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9261031150817871, | |
| "reward_std": 0.9859992563724518, | |
| "rewards/cosine_scaled_reward": 0.12971824035048485, | |
| "rewards/format_reward": 0.6666666939854622, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2114.0208740234375, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.19766280055046082, | |
| "kl": 0.003971099853515625, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3019937239587307, | |
| "reward_std": 0.6615323200821877, | |
| "rewards/cosine_scaled_reward": -0.1510864682495594, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 1614.7708435058594, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.23038722574710846, | |
| "kl": 0.0032196044921875, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": 0.0001, | |
| "reward": 0.5918858665972948, | |
| "reward_std": 0.47077811881899834, | |
| "rewards/cosine_scaled_reward": -0.09989039599895477, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 1724.5833740234375, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.2515551447868347, | |
| "kl": 0.00432586669921875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0002, | |
| "reward": 0.8076295026112348, | |
| "reward_std": 0.5722271054983139, | |
| "rewards/cosine_scaled_reward": 0.04964808002114296, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2433.4375610351562, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.3010346591472626, | |
| "kl": 0.004909515380859375, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0002, | |
| "reward": 0.34551432851003483, | |
| "reward_std": 0.7096427381038666, | |
| "rewards/cosine_scaled_reward": -0.08765951948589645, | |
| "rewards/format_reward": 0.5208333488553762, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 2419.979217529297, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.19969363510608673, | |
| "kl": 0.005977630615234375, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0002, | |
| "reward": 0.27888998575508595, | |
| "reward_std": 0.5231342613697052, | |
| "rewards/cosine_scaled_reward": -0.1001383513212204, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2777.1250610351562, | |
| "epoch": 0.2, | |
| "grad_norm": 0.17995108664035797, | |
| "kl": 0.0071868896484375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6402685008943081, | |
| "reward_std": 0.7186409756541252, | |
| "rewards/cosine_scaled_reward": 0.0909675620496273, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2093.25, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.20400448143482208, | |
| "kl": 0.00519561767578125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5985848978161812, | |
| "reward_std": 0.7769260108470917, | |
| "rewards/cosine_scaled_reward": -0.03404088690876961, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2018.1875915527344, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.21771669387817383, | |
| "kl": 0.004276275634765625, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0002, | |
| "reward": 0.5329161509871483, | |
| "reward_std": 0.5947398841381073, | |
| "rewards/cosine_scaled_reward": -0.0772919338196516, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 1553.2083892822266, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.2806382477283478, | |
| "kl": 0.00424957275390625, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9667995385825634, | |
| "reward_std": 0.4322159215807915, | |
| "rewards/cosine_scaled_reward": 0.09798309206962585, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2260.9584350585938, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.19229480624198914, | |
| "kl": 0.00655364990234375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6450915709137917, | |
| "reward_std": 0.6193302199244499, | |
| "rewards/cosine_scaled_reward": 0.030879119411110878, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2204.104248046875, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.34143543243408203, | |
| "kl": 0.00551605224609375, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0002, | |
| "reward": 0.49652543663978577, | |
| "reward_std": 0.9920015186071396, | |
| "rewards/cosine_scaled_reward": -0.06423728261142969, | |
| "rewards/format_reward": 0.6250000111758709, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 2376.3333740234375, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.1844940185546875, | |
| "kl": 0.002460479736328125, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0001, | |
| "reward": 0.7340436186641455, | |
| "reward_std": 0.7672436386346817, | |
| "rewards/cosine_scaled_reward": 0.033688463270664215, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 1781.0208435058594, | |
| "epoch": 0.208, | |
| "grad_norm": 0.27145451307296753, | |
| "kl": 0.005458831787109375, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6259329319000244, | |
| "reward_std": 0.7968147397041321, | |
| "rewards/cosine_scaled_reward": -0.062033540569245815, | |
| "rewards/format_reward": 0.7500000111758709, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 2343.8959045410156, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.2297639399766922, | |
| "kl": 0.00566864013671875, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0002, | |
| "reward": 0.11410272493958473, | |
| "reward_std": 0.5572097525000572, | |
| "rewards/cosine_scaled_reward": -0.2658653110265732, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 1841.2292175292969, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.2628481388092041, | |
| "kl": 0.003875732421875, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": 0.0002, | |
| "reward": 1.0498279109597206, | |
| "reward_std": 0.812163732945919, | |
| "rewards/cosine_scaled_reward": 0.1394973020069301, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2690.666748046875, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.2707008719444275, | |
| "kl": 0.006504058837890625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0003, | |
| "reward": 0.12935106456279755, | |
| "reward_std": 0.6062737256288528, | |
| "rewards/cosine_scaled_reward": -0.15407447703182697, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2648.7083740234375, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.277004599571228, | |
| "kl": 0.00493621826171875, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.0002, | |
| "reward": 0.3933283071964979, | |
| "reward_std": 0.6029615625739098, | |
| "rewards/cosine_scaled_reward": -0.07416917383670807, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2165.416778564453, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.2298295795917511, | |
| "kl": 0.005840301513671875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7806095313280821, | |
| "reward_std": 0.7954358160495758, | |
| "rewards/cosine_scaled_reward": 0.025721419602632523, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 1475.4167022705078, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.24691948294639587, | |
| "kl": 0.005809783935546875, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0002, | |
| "reward": 1.0945345759391785, | |
| "reward_std": 0.8786479085683823, | |
| "rewards/cosine_scaled_reward": 0.10976729169487953, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 1539.8333740234375, | |
| "epoch": 0.216, | |
| "grad_norm": 0.27775290608406067, | |
| "kl": 0.0064697265625, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6156105473637581, | |
| "reward_std": 0.7454669773578644, | |
| "rewards/cosine_scaled_reward": -0.119278060272336, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 2440.979278564453, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.22604604065418243, | |
| "kl": 0.00592041015625, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4239200847223401, | |
| "reward_std": 0.8845669329166412, | |
| "rewards/cosine_scaled_reward": -0.038039978593587875, | |
| "rewards/format_reward": 0.500000013038516, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 1904.729248046875, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.2662159204483032, | |
| "kl": 0.00586700439453125, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4985937252640724, | |
| "reward_std": 0.7315638810396194, | |
| "rewards/cosine_scaled_reward": -0.09445315971970558, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2417.1875610351562, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.27427810430526733, | |
| "kl": 0.009246826171875, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0004, | |
| "reward": 0.3124541025608778, | |
| "reward_std": 0.6425688564777374, | |
| "rewards/cosine_scaled_reward": -0.12502295151352882, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 1865.0208740234375, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.2562018930912018, | |
| "kl": 0.01080322265625, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0004, | |
| "reward": 0.5529625415802002, | |
| "reward_std": 0.5897716134786606, | |
| "rewards/cosine_scaled_reward": -0.0985187292098999, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1062.4375305175781, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.22855594754219055, | |
| "kl": 0.0037994384765625, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0002, | |
| "reward": 1.7262530326843262, | |
| "reward_std": 0.826399639248848, | |
| "rewards/cosine_scaled_reward": 0.3631264716386795, | |
| "rewards/format_reward": 1.0, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 1257.5625305175781, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.26371893286705017, | |
| "kl": 0.00643157958984375, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0003, | |
| "reward": 1.151860922574997, | |
| "reward_std": 0.6702793166041374, | |
| "rewards/cosine_scaled_reward": 0.12801377475261688, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2729.5000610351562, | |
| "epoch": 0.224, | |
| "grad_norm": 0.2403058111667633, | |
| "kl": 0.00676727294921875, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0003, | |
| "reward": 0.26913030445575714, | |
| "reward_std": 0.6797884181141853, | |
| "rewards/cosine_scaled_reward": -0.08418486639857292, | |
| "rewards/format_reward": 0.4375, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 2123.791717529297, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.22864989936351776, | |
| "kl": 0.006134033203125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.0002, | |
| "reward": 0.06347193196415901, | |
| "reward_std": 0.40899810940027237, | |
| "rewards/cosine_scaled_reward": -0.24951404333114624, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 2316.791748046875, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.2166266292333603, | |
| "kl": 0.00603485107421875, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0002, | |
| "reward": 0.4236091636121273, | |
| "reward_std": 0.794644683599472, | |
| "rewards/cosine_scaled_reward": -0.1215287372469902, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2175.5416870117188, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.2332044243812561, | |
| "kl": 0.005603790283203125, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0002, | |
| "reward": 1.2629163265228271, | |
| "reward_std": 0.7567542046308517, | |
| "rewards/cosine_scaled_reward": 0.2252081297338009, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 1545.8125305175781, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 0.3451651632785797, | |
| "kl": 0.006610870361328125, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1884014122188091, | |
| "reward_std": 0.868816927075386, | |
| "rewards/cosine_scaled_reward": 0.1983673730865121, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 1524.0625610351562, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.21861064434051514, | |
| "kl": 0.0051116943359375, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0002, | |
| "reward": 0.675473814830184, | |
| "reward_std": 0.6859661787748337, | |
| "rewards/cosine_scaled_reward": -0.07892975211143494, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 1321.3959045410156, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.24629908800125122, | |
| "kl": 0.007568359375, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9415311962366104, | |
| "reward_std": 0.7775374501943588, | |
| "rewards/cosine_scaled_reward": 0.02284892648458481, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 1379.7709045410156, | |
| "epoch": 0.232, | |
| "grad_norm": 0.27627113461494446, | |
| "kl": 0.00753021240234375, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1194992661476135, | |
| "reward_std": 0.7730197608470917, | |
| "rewards/cosine_scaled_reward": 0.09099959582090378, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 1120.1667022705078, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.26729604601860046, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0003, | |
| "reward": 1.2363095879554749, | |
| "reward_std": 0.8477204591035843, | |
| "rewards/cosine_scaled_reward": 0.15982142463326454, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 1971.2291870117188, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.2195984125137329, | |
| "kl": 0.00698089599609375, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8974205702543259, | |
| "reward_std": 0.8895229697227478, | |
| "rewards/cosine_scaled_reward": 0.052876945585012436, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 2061.3334350585938, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.268889844417572, | |
| "kl": 0.007232666015625, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2433365173637867, | |
| "reward_std": 0.5611164793372154, | |
| "rewards/cosine_scaled_reward": -0.2324984148144722, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 1904.5000305175781, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.19144296646118164, | |
| "kl": 0.00501251220703125, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9468748420476913, | |
| "reward_std": 0.6613385528326035, | |
| "rewards/cosine_scaled_reward": 0.05677075684070587, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 1996.8125610351562, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 0.29941245913505554, | |
| "kl": 0.00930023193359375, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0004, | |
| "reward": 0.29106441140174866, | |
| "reward_std": 0.6116437911987305, | |
| "rewards/cosine_scaled_reward": -0.17738447710871696, | |
| "rewards/format_reward": 0.6458333544433117, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 1285.9166870117188, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.3566973805427551, | |
| "kl": 0.00695037841796875, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5402148813009262, | |
| "reward_std": 0.7111145555973053, | |
| "rewards/cosine_scaled_reward": -0.14655922167003155, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 1275.5000305175781, | |
| "epoch": 0.24, | |
| "grad_norm": 0.2605917155742645, | |
| "kl": 0.006221771240234375, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0002, | |
| "reward": 1.3301078528165817, | |
| "reward_std": 0.8438884019851685, | |
| "rewards/cosine_scaled_reward": 0.18588725943118334, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 2328.6251220703125, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.1887713074684143, | |
| "kl": 0.0092010498046875, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6505604535341263, | |
| "reward_std": 0.5875601321458817, | |
| "rewards/cosine_scaled_reward": -0.008053132332861423, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 1282.166732788086, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.2532815635204315, | |
| "kl": 0.0076446533203125, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1198171079158783, | |
| "reward_std": 0.8018105626106262, | |
| "rewards/cosine_scaled_reward": 0.11199186649173498, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 1855.0834045410156, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.2252466082572937, | |
| "kl": 0.0081024169921875, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0003, | |
| "reward": 0.2633536756038666, | |
| "reward_std": 0.42707760632038116, | |
| "rewards/cosine_scaled_reward": -0.2224898338317871, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 1971.8959045410156, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.1935078501701355, | |
| "kl": 0.004627227783203125, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.0002, | |
| "reward": 0.7197396508418024, | |
| "reward_std": 0.6319501101970673, | |
| "rewards/cosine_scaled_reward": -0.056796859949827194, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 1413.3541870117188, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.2069859504699707, | |
| "kl": 0.00742340087890625, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0003, | |
| "reward": 1.0066201090812683, | |
| "reward_std": 0.9675450921058655, | |
| "rewards/cosine_scaled_reward": 0.03456003498286009, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 2199.2083435058594, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.2034756988286972, | |
| "kl": 0.00646209716796875, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0003, | |
| "reward": 0.47810695320367813, | |
| "reward_std": 0.7864377945661545, | |
| "rewards/cosine_scaled_reward": -0.12552986666560173, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 1527.8958740234375, | |
| "epoch": 0.248, | |
| "grad_norm": 0.2671460807323456, | |
| "kl": 0.006622314453125, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7772237807512283, | |
| "reward_std": 0.6489354968070984, | |
| "rewards/cosine_scaled_reward": -0.038471437990665436, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 1419.3542175292969, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.2513315677642822, | |
| "kl": 0.00665283203125, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1047292775474489, | |
| "reward_std": 0.6393595859408379, | |
| "rewards/cosine_scaled_reward": 0.1252813059836626, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 1275.3125305175781, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 0.2648639380931854, | |
| "kl": 0.00737762451171875, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6924525499343872, | |
| "reward_std": 0.6107815653085709, | |
| "rewards/cosine_scaled_reward": -0.13294040283653885, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 1247.9791717529297, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 0.22622907161712646, | |
| "kl": 0.006031036376953125, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9419594034552574, | |
| "reward_std": 0.675844706594944, | |
| "rewards/cosine_scaled_reward": 0.023063029162585735, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2002.0833740234375, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.218685120344162, | |
| "kl": 0.0069580078125, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6575891096144915, | |
| "reward_std": 0.6497488841414452, | |
| "rewards/cosine_scaled_reward": 0.026711229234933853, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 2015.6250915527344, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.20831483602523804, | |
| "kl": 0.0071258544921875, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14067217335104942, | |
| "reward_std": 0.48574624210596085, | |
| "rewards/cosine_scaled_reward": -0.28383059799671173, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 1517.1875305175781, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 0.24125142395496368, | |
| "kl": 0.005817413330078125, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.0002, | |
| "reward": 0.6348569616675377, | |
| "reward_std": 0.5405807122588158, | |
| "rewards/cosine_scaled_reward": -0.09923820104449987, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 1914.0000305175781, | |
| "epoch": 0.256, | |
| "grad_norm": 0.2622263431549072, | |
| "kl": 0.00728607177734375, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5525996647775173, | |
| "reward_std": 0.5521951243281364, | |
| "rewards/cosine_scaled_reward": -0.04661682341247797, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 1250.0000305175781, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 0.2181866317987442, | |
| "kl": 0.00460052490234375, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0002, | |
| "reward": 1.2948355674743652, | |
| "reward_std": 0.6228364408016205, | |
| "rewards/cosine_scaled_reward": 0.16825110744684935, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 1461.6666870117188, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.36098772287368774, | |
| "kl": 0.00760650634765625, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0003, | |
| "reward": 0.3821214698255062, | |
| "reward_std": 0.5764878466725349, | |
| "rewards/cosine_scaled_reward": -0.20477261394262314, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 1328.5416870117188, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.27139514684677124, | |
| "kl": 0.00934600830078125, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7815765663981438, | |
| "reward_std": 0.7309335023164749, | |
| "rewards/cosine_scaled_reward": -0.03629505028948188, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 1626.4167175292969, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.23888561129570007, | |
| "kl": 0.00714874267578125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6099164858460426, | |
| "reward_std": 0.7778853923082352, | |
| "rewards/cosine_scaled_reward": -0.09087510220706463, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 1440.6875457763672, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.2864842116832733, | |
| "kl": 0.0119171142578125, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0005, | |
| "reward": 0.7616857700049877, | |
| "reward_std": 0.7498719990253448, | |
| "rewards/cosine_scaled_reward": -0.025407111272215843, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 1527.8958740234375, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.2429640144109726, | |
| "kl": 0.00768280029296875, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9651975035667419, | |
| "reward_std": 0.824803501367569, | |
| "rewards/cosine_scaled_reward": 0.06593206711113453, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 1934.2084045410156, | |
| "epoch": 0.264, | |
| "grad_norm": 0.3963330090045929, | |
| "kl": 0.0092926025390625, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6242162762209773, | |
| "reward_std": 0.8598367348313332, | |
| "rewards/cosine_scaled_reward": -0.03164188005030155, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 1455.8333740234375, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.23361533880233765, | |
| "kl": 0.00798797607421875, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7301613166928291, | |
| "reward_std": 0.7596315294504166, | |
| "rewards/cosine_scaled_reward": -0.07241935143247247, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2010.8333740234375, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.2177191823720932, | |
| "kl": 0.0087738037109375, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0004, | |
| "reward": 0.6955921053886414, | |
| "reward_std": 0.8746853768825531, | |
| "rewards/cosine_scaled_reward": -0.0480372947640717, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 1696.8541870117188, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 0.2090214192867279, | |
| "kl": 0.008090972900390625, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6760512292385101, | |
| "reward_std": 0.6585969775915146, | |
| "rewards/cosine_scaled_reward": -0.0578077242244035, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 1389.6667175292969, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.2434709221124649, | |
| "kl": 0.006389617919921875, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6067800773307681, | |
| "reward_std": 0.41319192945957184, | |
| "rewards/cosine_scaled_reward": -0.1028599888086319, | |
| "rewards/format_reward": 0.8125, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 1396.0625305175781, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.23188619315624237, | |
| "kl": 0.0070953369140625, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0003, | |
| "reward": 0.5102774500846863, | |
| "reward_std": 0.4952257424592972, | |
| "rewards/cosine_scaled_reward": -0.21361128613352776, | |
| "rewards/format_reward": 0.9375, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 1236.3958892822266, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.20340608060359955, | |
| "kl": 0.00693511962890625, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9460461437702179, | |
| "reward_std": 0.5520051866769791, | |
| "rewards/cosine_scaled_reward": -0.01656026765704155, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 1293.0208435058594, | |
| "epoch": 0.272, | |
| "grad_norm": 0.28505584597587585, | |
| "kl": 0.00867462158203125, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6806494742631912, | |
| "reward_std": 0.7478103041648865, | |
| "rewards/cosine_scaled_reward": -0.11800861544907093, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 1973.0000915527344, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.29544979333877563, | |
| "kl": 0.0121917724609375, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0005, | |
| "reward": 0.5413463786244392, | |
| "reward_std": 0.7216374576091766, | |
| "rewards/cosine_scaled_reward": -0.10432682058308274, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 1372.6250305175781, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 0.2549286484718323, | |
| "kl": 0.00878143310546875, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0004, | |
| "reward": 1.4046210646629333, | |
| "reward_std": 0.5067310631275177, | |
| "rewards/cosine_scaled_reward": 0.2543938383460045, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 1410.0417175292969, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.32747364044189453, | |
| "kl": 0.009124755859375, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8229547590017319, | |
| "reward_std": 0.6901429891586304, | |
| "rewards/cosine_scaled_reward": -0.026022649370133877, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 1473.6875228881836, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.3592205345630646, | |
| "kl": 0.012908935546875, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0005, | |
| "reward": 0.7551293671131134, | |
| "reward_std": 0.663729339838028, | |
| "rewards/cosine_scaled_reward": -0.0599353089928627, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 1187.0625610351562, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 0.22532759606838226, | |
| "kl": 0.0076751708984375, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": 0.0003, | |
| "reward": 0.8520705178380013, | |
| "reward_std": 0.45504674315452576, | |
| "rewards/cosine_scaled_reward": -0.05313139781355858, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 1377.7291870117188, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.2351863533258438, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0003, | |
| "reward": 1.1842002123594284, | |
| "reward_std": 1.0244361460208893, | |
| "rewards/cosine_scaled_reward": 0.15460011083632708, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 1255.4167175292969, | |
| "epoch": 0.28, | |
| "grad_norm": 0.34309011697769165, | |
| "kl": 0.01132965087890625, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0005, | |
| "reward": 0.8439341634511948, | |
| "reward_std": 0.6376037150621414, | |
| "rewards/cosine_scaled_reward": -0.025949583388864994, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 1144.6250305175781, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 0.27464959025382996, | |
| "kl": 0.0088958740234375, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9792153835296631, | |
| "reward_std": 0.5227902606129646, | |
| "rewards/cosine_scaled_reward": 2.4352222681045532e-05, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 985.2083587646484, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.3299963176250458, | |
| "kl": 0.01165771484375, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": 0.0005, | |
| "reward": 1.1308997794985771, | |
| "reward_std": 0.699935294687748, | |
| "rewards/cosine_scaled_reward": 0.09669988602399826, | |
| "rewards/format_reward": 0.9375, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 1719.0208740234375, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 0.27527329325675964, | |
| "kl": 0.0107879638671875, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0004, | |
| "reward": 0.808366172015667, | |
| "reward_std": 0.725908488035202, | |
| "rewards/cosine_scaled_reward": -0.012483585625886917, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 1970.2083740234375, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 0.27165958285331726, | |
| "kl": 0.011810302734375, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0005, | |
| "reward": 0.3851170837879181, | |
| "reward_std": 0.7313886731863022, | |
| "rewards/cosine_scaled_reward": -0.16160813719034195, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 1342.3750305175781, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.2968628704547882, | |
| "kl": 0.00905609130859375, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8386699110269547, | |
| "reward_std": 0.7474230378866196, | |
| "rewards/cosine_scaled_reward": -0.02858173381537199, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 1774.0208740234375, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.39884844422340393, | |
| "kl": 0.0139312744140625, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0006, | |
| "reward": 1.0918036848306656, | |
| "reward_std": 1.1108526289463043, | |
| "rewards/cosine_scaled_reward": 0.1917351707816124, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 1604.7500610351562, | |
| "epoch": 0.288, | |
| "grad_norm": 0.32672712206840515, | |
| "kl": 0.0103912353515625, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0004, | |
| "reward": 0.7797054275870323, | |
| "reward_std": 0.6628324761986732, | |
| "rewards/cosine_scaled_reward": -0.06848062574863434, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 1578.9167175292969, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.3023277223110199, | |
| "kl": 0.01739501953125, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": 0.0007, | |
| "reward": 0.7126835100352764, | |
| "reward_std": 0.786981999874115, | |
| "rewards/cosine_scaled_reward": -0.039491571485996246, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 1142.6458435058594, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.4636065363883972, | |
| "kl": 0.0141448974609375, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1053122580051422, | |
| "reward_std": 0.47931085526943207, | |
| "rewards/cosine_scaled_reward": 0.16723946738056839, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 1286.0833435058594, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.28858116269111633, | |
| "kl": 0.009735107421875, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0004, | |
| "reward": 1.1211883053183556, | |
| "reward_std": 0.4671914726495743, | |
| "rewards/cosine_scaled_reward": 0.09184413589537144, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1213.2083740234375, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.32043886184692383, | |
| "kl": 0.015838623046875, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": 0.0006, | |
| "reward": 0.823145791888237, | |
| "reward_std": 0.5544667765498161, | |
| "rewards/cosine_scaled_reward": -0.0675937756896019, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 1316.8541870117188, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.23760062456130981, | |
| "kl": 0.0098114013671875, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0004, | |
| "reward": 0.49779732525348663, | |
| "reward_std": 0.46498920768499374, | |
| "rewards/cosine_scaled_reward": -0.23026802763342857, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 1751.2708435058594, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.254151314496994, | |
| "kl": 0.0131378173828125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0005, | |
| "reward": 0.6079542301595211, | |
| "reward_std": 0.6472693011164665, | |
| "rewards/cosine_scaled_reward": -0.10227290168404579, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 978.6458435058594, | |
| "epoch": 0.296, | |
| "grad_norm": 0.29362747073173523, | |
| "kl": 0.00855255126953125, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.0003, | |
| "reward": 0.7723531350493431, | |
| "reward_std": 0.7641957998275757, | |
| "rewards/cosine_scaled_reward": -0.10340679436922073, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 1655.1875305175781, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.38929465413093567, | |
| "kl": 0.0145721435546875, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0006, | |
| "reward": 0.37256848718971014, | |
| "reward_std": 0.6071458011865616, | |
| "rewards/cosine_scaled_reward": -0.18871578108519316, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 1198.8333892822266, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.2716739773750305, | |
| "kl": 0.0102081298828125, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0004, | |
| "reward": 0.633615754544735, | |
| "reward_std": 0.586112380027771, | |
| "rewards/cosine_scaled_reward": -0.1519421450793743, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 1484.1042175292969, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.2628130316734314, | |
| "kl": 0.0106353759765625, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0004, | |
| "reward": 0.4845110587775707, | |
| "reward_std": 0.536693274974823, | |
| "rewards/cosine_scaled_reward": -0.18482780829071999, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 1064.0833587646484, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.36321043968200684, | |
| "kl": 0.0098876953125, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9257199168205261, | |
| "reward_std": 0.8942077457904816, | |
| "rewards/cosine_scaled_reward": -0.016306710429489613, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 1316.1042175292969, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.26243484020233154, | |
| "kl": 0.00811004638671875, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0003, | |
| "reward": 0.6479791402816772, | |
| "reward_std": 0.6652240380644798, | |
| "rewards/cosine_scaled_reward": -0.15517710940912366, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 1909.6250915527344, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.2833210229873657, | |
| "kl": 0.01206207275390625, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0005, | |
| "reward": 0.7709198147058487, | |
| "reward_std": 0.6459413915872574, | |
| "rewards/cosine_scaled_reward": 4.323199391365051e-05, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 1645.9583740234375, | |
| "epoch": 0.304, | |
| "grad_norm": 0.3645997941493988, | |
| "kl": 0.01513671875, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0006, | |
| "reward": 0.4407457821071148, | |
| "reward_std": 0.6104073449969292, | |
| "rewards/cosine_scaled_reward": -0.1858771131373942, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 1644.0209045410156, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.27599290013313293, | |
| "kl": 0.014923095703125, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0006, | |
| "reward": 0.7639178857207298, | |
| "reward_std": 0.5990442484617233, | |
| "rewards/cosine_scaled_reward": -0.013874400407075882, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 1681.4583587646484, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.2269752472639084, | |
| "kl": 0.01216888427734375, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0005, | |
| "reward": 0.5377051346004009, | |
| "reward_std": 0.6556981913745403, | |
| "rewards/cosine_scaled_reward": -0.14781412575393915, | |
| "rewards/format_reward": 0.8333333358168602, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 1047.1250457763672, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.3485262095928192, | |
| "kl": 0.01123046875, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0005, | |
| "reward": 0.68592269718647, | |
| "reward_std": 0.5478300377726555, | |
| "rewards/cosine_scaled_reward": -0.13620532862842083, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1012.0208587646484, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.3116324543952942, | |
| "kl": 0.007354736328125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9341000914573669, | |
| "reward_std": 0.6274480298161507, | |
| "rewards/cosine_scaled_reward": -0.03294998221099377, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 1732.6459350585938, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.42208629846572876, | |
| "kl": 0.016632080078125, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0007, | |
| "reward": 0.7596426885575056, | |
| "reward_std": 0.77412149310112, | |
| "rewards/cosine_scaled_reward": -0.0055953278206288815, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 2177.270965576172, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.5940233469009399, | |
| "kl": 0.02764892578125, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0011, | |
| "reward": 0.4223189577460289, | |
| "reward_std": 0.8890358135104179, | |
| "rewards/cosine_scaled_reward": -0.10134052112698555, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 1954.8333740234375, | |
| "epoch": 0.312, | |
| "grad_norm": 0.3138795495033264, | |
| "kl": 0.02874755859375, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0012, | |
| "reward": 0.3654465600848198, | |
| "reward_std": 0.5650510713458061, | |
| "rewards/cosine_scaled_reward": -0.1506100632250309, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 1267.4583587646484, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.274538516998291, | |
| "kl": 0.01172637939453125, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0005, | |
| "reward": 0.837937019765377, | |
| "reward_std": 0.6332506015896797, | |
| "rewards/cosine_scaled_reward": -0.03936483711004257, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2021.4583740234375, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.30807891488075256, | |
| "kl": 0.02382659912109375, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.001, | |
| "reward": 0.9066380485892296, | |
| "reward_std": 0.9834412485361099, | |
| "rewards/cosine_scaled_reward": 0.1095690238289535, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 1135.8750305175781, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.2739965319633484, | |
| "kl": 0.010345458984375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0004, | |
| "reward": 1.0236308723688126, | |
| "reward_std": 0.5140665993094444, | |
| "rewards/cosine_scaled_reward": 0.04306542640551925, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 1247.2500305175781, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 0.37156689167022705, | |
| "kl": 0.00988006591796875, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8927154019474983, | |
| "reward_std": 0.8232090175151825, | |
| "rewards/cosine_scaled_reward": -0.0015590004622936249, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 1309.3125610351562, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.2857230603694916, | |
| "kl": 0.0121002197265625, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0005, | |
| "reward": 0.942589208483696, | |
| "reward_std": 0.6875655725598335, | |
| "rewards/cosine_scaled_reward": 0.012961250729858875, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 1435.4791717529297, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.43713676929473877, | |
| "kl": 0.01840972900390625, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0007, | |
| "reward": 0.37974046915769577, | |
| "reward_std": 0.5385972559452057, | |
| "rewards/cosine_scaled_reward": -0.2163797914981842, | |
| "rewards/format_reward": 0.8125, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 1156.062515258789, | |
| "epoch": 0.32, | |
| "grad_norm": 0.2634413242340088, | |
| "kl": 0.0093536376953125, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0004, | |
| "reward": 0.980226680636406, | |
| "reward_std": 0.5487889721989632, | |
| "rewards/cosine_scaled_reward": 0.0005300038028508425, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 1187.3125457763672, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.32134756445884705, | |
| "kl": 0.008182525634765625, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": 0.0003, | |
| "reward": 0.9637440145015717, | |
| "reward_std": 0.8387380540370941, | |
| "rewards/cosine_scaled_reward": 0.0027053444646298885, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 1651.604232788086, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.37761881947517395, | |
| "kl": 0.0242919921875, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.001, | |
| "reward": 0.5987202003598213, | |
| "reward_std": 0.6141614019870758, | |
| "rewards/cosine_scaled_reward": -0.10688992030918598, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 1207.1250610351562, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.3965184688568115, | |
| "kl": 0.0155487060546875, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0006, | |
| "reward": 0.6064153388142586, | |
| "reward_std": 0.6052896529436111, | |
| "rewards/cosine_scaled_reward": -0.16554233682109043, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 1520.5625610351562, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.4802095890045166, | |
| "kl": 0.01385498046875, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0006, | |
| "reward": 0.6298409104347229, | |
| "reward_std": 0.7485504075884819, | |
| "rewards/cosine_scaled_reward": -0.10174621269106865, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 1631.4166870117188, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.5242775678634644, | |
| "kl": 0.0250244140625, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.001, | |
| "reward": 0.4431188479065895, | |
| "reward_std": 0.6355130672454834, | |
| "rewards/cosine_scaled_reward": -0.20552390813827515, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 1278.2917175292969, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 0.2374052256345749, | |
| "kl": 0.009552001953125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0004, | |
| "reward": 0.9486149102449417, | |
| "reward_std": 0.6938119828701019, | |
| "rewards/cosine_scaled_reward": -0.025692567229270935, | |
| "rewards/format_reward": 1.0, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 1549.5208740234375, | |
| "epoch": 0.328, | |
| "grad_norm": 0.5496286749839783, | |
| "kl": 0.0344696044921875, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.0014, | |
| "reward": 0.6046592518687248, | |
| "reward_std": 0.7527553886175156, | |
| "rewards/cosine_scaled_reward": -0.12475371174514294, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 906.6041717529297, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.40935972332954407, | |
| "kl": 0.014556884765625, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.0006, | |
| "reward": 1.1515108793973923, | |
| "reward_std": 0.670855775475502, | |
| "rewards/cosine_scaled_reward": 0.07575542479753494, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 1302.3958587646484, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 0.33956480026245117, | |
| "kl": 0.01055145263671875, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0004, | |
| "reward": 0.8325834274291992, | |
| "reward_std": 0.469268262386322, | |
| "rewards/cosine_scaled_reward": -0.0628749430179596, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 1219.1667022705078, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 0.5517088174819946, | |
| "kl": 0.0243072509765625, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.001, | |
| "reward": 1.3289316296577454, | |
| "reward_std": 0.8555012494325638, | |
| "rewards/cosine_scaled_reward": 0.1957157626748085, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1010.8541870117188, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.44879335165023804, | |
| "kl": 0.020660400390625, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": 0.0008, | |
| "reward": 0.886590301990509, | |
| "reward_std": 0.6713649779558182, | |
| "rewards/cosine_scaled_reward": 0.00579514354467392, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 1223.687515258789, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.3632715344429016, | |
| "kl": 0.0165252685546875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0007, | |
| "reward": 0.8943772986531258, | |
| "reward_std": 0.8097474128007889, | |
| "rewards/cosine_scaled_reward": -0.0007280493155121803, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 1374.0208740234375, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.4008006155490875, | |
| "kl": 0.0230865478515625, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.0009, | |
| "reward": 0.6955159157514572, | |
| "reward_std": 0.6655403971672058, | |
| "rewards/cosine_scaled_reward": -0.11057540401816368, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 1269.2917022705078, | |
| "epoch": 0.336, | |
| "grad_norm": 0.20706599950790405, | |
| "kl": 0.00942230224609375, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0004, | |
| "reward": 1.2973814010620117, | |
| "reward_std": 0.7491715997457504, | |
| "rewards/cosine_scaled_reward": 0.16952402517199516, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2128.9375610351562, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 0.3372494876384735, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0016, | |
| "reward": 0.4113108851015568, | |
| "reward_std": 0.6322937309741974, | |
| "rewards/cosine_scaled_reward": -0.11726122908294201, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 938.1042022705078, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 0.3853781521320343, | |
| "kl": 0.01088714599609375, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0004, | |
| "reward": 1.2917412221431732, | |
| "reward_std": 0.859960287809372, | |
| "rewards/cosine_scaled_reward": 0.16670391708612442, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 1234.5000305175781, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 0.32493123412132263, | |
| "kl": 0.02001953125, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0008, | |
| "reward": 0.5709987878799438, | |
| "reward_std": 0.5739526003599167, | |
| "rewards/cosine_scaled_reward": -0.1728339404799044, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 1737.8750305175781, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 0.6288060545921326, | |
| "kl": 0.02978515625, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0012, | |
| "reward": 0.7676291763782501, | |
| "reward_std": 0.6836749911308289, | |
| "rewards/cosine_scaled_reward": -0.0328520848415792, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 1495.9583740234375, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 0.5687032341957092, | |
| "kl": 0.02923583984375, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0012, | |
| "reward": 0.7650880664587021, | |
| "reward_std": 0.7526437044143677, | |
| "rewards/cosine_scaled_reward": -0.0757893230766058, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 1482.9167022705078, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 0.307375431060791, | |
| "kl": 0.0286102294921875, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0011, | |
| "reward": 1.0939996913075447, | |
| "reward_std": 0.6226199977099895, | |
| "rewards/cosine_scaled_reward": 0.10949981957674026, | |
| "rewards/format_reward": 0.875, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 1668.7708740234375, | |
| "epoch": 0.344, | |
| "grad_norm": 0.3655478358268738, | |
| "kl": 0.0385589599609375, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0015, | |
| "reward": 0.9393804222345352, | |
| "reward_std": 0.5472998023033142, | |
| "rewards/cosine_scaled_reward": 0.04260684549808502, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1460.1667175292969, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.5648999810218811, | |
| "kl": 0.0326690673828125, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": 0.0013, | |
| "reward": 0.8058248609304428, | |
| "reward_std": 0.5371805727481842, | |
| "rewards/cosine_scaled_reward": -0.034587569534778595, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1176.3958740234375, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.2893832325935364, | |
| "kl": 0.01714324951171875, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": 0.0007, | |
| "reward": 0.9501863233745098, | |
| "reward_std": 0.7773154973983765, | |
| "rewards/cosine_scaled_reward": 0.006343139801174402, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1049.2708587646484, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.3396141231060028, | |
| "kl": 0.017486572265625, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": 0.0007, | |
| "reward": 1.3814565241336823, | |
| "reward_std": 0.7759077772498131, | |
| "rewards/cosine_scaled_reward": 0.21156160347163677, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1675.1667175292969, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.32787981629371643, | |
| "kl": 0.0314483642578125, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": 0.0013, | |
| "reward": 0.7963635921478271, | |
| "reward_std": 0.7029048502445221, | |
| "rewards/cosine_scaled_reward": -0.018484866246581078, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1478.7916870117188, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.7724860310554504, | |
| "kl": 0.04205322265625, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": 0.0017, | |
| "reward": 0.934022843837738, | |
| "reward_std": 0.6418131068348885, | |
| "rewards/cosine_scaled_reward": 0.039928069338202477, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1466.062515258789, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.7901880741119385, | |
| "kl": 0.048187255859375, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": 0.0019, | |
| "reward": 1.208159700036049, | |
| "reward_std": 0.9337977021932602, | |
| "rewards/cosine_scaled_reward": 0.16657985746860504, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 1011.0, | |
| "epoch": 0.352, | |
| "grad_norm": 0.43838247656822205, | |
| "kl": 0.02608489990234375, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": 0.001, | |
| "reward": 0.8291152790188789, | |
| "reward_std": 0.4770050719380379, | |
| "rewards/cosine_scaled_reward": -0.022942371666431427, | |
| "rewards/format_reward": 0.875, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 1304.812515258789, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 0.34449389576911926, | |
| "kl": 0.027618408203125, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0011, | |
| "reward": 0.9580995887517929, | |
| "reward_std": 0.9287254959344864, | |
| "rewards/cosine_scaled_reward": 0.020716451108455658, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1587.1875610351562, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.3894862234592438, | |
| "kl": 0.0413360595703125, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": 0.0017, | |
| "reward": 0.967779666185379, | |
| "reward_std": 0.49595198780298233, | |
| "rewards/cosine_scaled_reward": 0.04638980980962515, | |
| "rewards/format_reward": 0.875, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1005.9583587646484, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 0.5111984610557556, | |
| "kl": 0.0352020263671875, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.0014, | |
| "reward": 0.8524645194411278, | |
| "reward_std": 0.6346431374549866, | |
| "rewards/cosine_scaled_reward": -0.042517755180597305, | |
| "rewards/format_reward": 0.9375, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1219.1042175292969, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 0.24494509398937225, | |
| "kl": 0.0164337158203125, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0007, | |
| "reward": 1.1846633851528168, | |
| "reward_std": 0.8452874422073364, | |
| "rewards/cosine_scaled_reward": 0.09233169769868255, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 1772.6459045410156, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.32232236862182617, | |
| "kl": 0.05014801025390625, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": 0.002, | |
| "reward": 0.7452734671533108, | |
| "reward_std": 0.5130416378378868, | |
| "rewards/cosine_scaled_reward": -0.09611329552717507, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 1450.9167022705078, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 0.7095328569412231, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.0017, | |
| "reward": 0.695801317691803, | |
| "reward_std": 0.6888641864061356, | |
| "rewards/cosine_scaled_reward": -0.07918267324566841, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 890.2083435058594, | |
| "epoch": 0.36, | |
| "grad_norm": 0.8648350238800049, | |
| "kl": 0.02002716064453125, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.0008, | |
| "reward": 0.8261366635560989, | |
| "reward_std": 0.5876666381955147, | |
| "rewards/cosine_scaled_reward": -0.07651501428335905, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 1495.2917175292969, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.7055426239967346, | |
| "kl": 0.0330047607421875, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0013, | |
| "reward": 0.5078188478946686, | |
| "reward_std": 0.6738529801368713, | |
| "rewards/cosine_scaled_reward": -0.18359058536589146, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 2028.3959350585938, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.6188425421714783, | |
| "kl": 0.1023101806640625, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0041, | |
| "reward": 0.6971778050065041, | |
| "reward_std": 0.6844265758991241, | |
| "rewards/cosine_scaled_reward": 0.015255570411682129, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 1409.0833740234375, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.4974954426288605, | |
| "kl": 0.04302978515625, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0017, | |
| "reward": 0.9140654609072953, | |
| "reward_std": 0.590464636683464, | |
| "rewards/cosine_scaled_reward": 0.040366058237850666, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 1569.2500305175781, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 0.3885625898838043, | |
| "kl": 0.0399017333984375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.0016, | |
| "reward": 0.7486050575971603, | |
| "reward_std": 0.9239681512117386, | |
| "rewards/cosine_scaled_reward": -0.08403081598225981, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 1059.1875305175781, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 0.5842506289482117, | |
| "kl": 0.0222625732421875, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0009, | |
| "reward": 0.872907280921936, | |
| "reward_std": 0.7717489525675774, | |
| "rewards/cosine_scaled_reward": -0.0635463809594512, | |
| "rewards/format_reward": 1.0, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 1675.8334045410156, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 1.7648111581802368, | |
| "kl": 0.10723876953125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0043, | |
| "reward": 0.6553980484604836, | |
| "reward_std": 0.6528958007693291, | |
| "rewards/cosine_scaled_reward": -0.06813432276248932, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 1275.2708587646484, | |
| "epoch": 0.368, | |
| "grad_norm": 0.6659455895423889, | |
| "kl": 0.0338134765625, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.0013, | |
| "reward": 1.113847702741623, | |
| "reward_std": 0.7288860827684402, | |
| "rewards/cosine_scaled_reward": 0.14025717787444592, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 1612.9166717529297, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.6660499572753906, | |
| "kl": 0.0994873046875, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.004, | |
| "reward": 1.0578741058707237, | |
| "reward_std": 0.49694500118494034, | |
| "rewards/cosine_scaled_reward": 0.09143703989684582, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 1938.4584045410156, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 1.3754856586456299, | |
| "kl": 0.1189422607421875, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0047, | |
| "reward": 0.27103549893945456, | |
| "reward_std": 0.5563001856207848, | |
| "rewards/cosine_scaled_reward": -0.23948227241635323, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 953.395881652832, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.36472955346107483, | |
| "kl": 0.0133514404296875, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0005, | |
| "reward": 1.0393343269824982, | |
| "reward_std": 0.7779577225446701, | |
| "rewards/cosine_scaled_reward": 0.01966716069728136, | |
| "rewards/format_reward": 1.0, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 1773.9167175292969, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 1.3614015579223633, | |
| "kl": 0.09857177734375, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0039, | |
| "reward": 0.4826292358338833, | |
| "reward_std": 0.624052882194519, | |
| "rewards/cosine_scaled_reward": -0.1753520662896335, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 1750.354232788086, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 1.2214010953903198, | |
| "kl": 0.092681884765625, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.0037, | |
| "reward": 0.6829137187451124, | |
| "reward_std": 0.5776621401309967, | |
| "rewards/cosine_scaled_reward": -0.05437648296356201, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 1816.6250610351562, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 0.6874251961708069, | |
| "kl": 0.08489990234375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.0034, | |
| "reward": 0.796313688158989, | |
| "reward_std": 1.0243088752031326, | |
| "rewards/cosine_scaled_reward": -0.008093174546957016, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2015.5000305175781, | |
| "epoch": 0.376, | |
| "grad_norm": 1.2615182399749756, | |
| "kl": 0.15081787109375, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.006, | |
| "reward": 0.6527662584558129, | |
| "reward_std": 0.7077807486057281, | |
| "rewards/cosine_scaled_reward": -0.09028353914618492, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2259.291717529297, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 1.1345741748809814, | |
| "kl": 0.1787109375, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0071, | |
| "reward": 0.5901373848319054, | |
| "reward_std": 0.9298846423625946, | |
| "rewards/cosine_scaled_reward": -0.03826466016471386, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2008.9584045410156, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 1.42428719997406, | |
| "kl": 0.228515625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0092, | |
| "reward": 0.315967773552984, | |
| "reward_std": 0.7076915055513382, | |
| "rewards/cosine_scaled_reward": -0.19618277810513973, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 1344.7917175292969, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 1.2130067348480225, | |
| "kl": 0.17629241943359375, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.007, | |
| "reward": 1.3930619359016418, | |
| "reward_std": 0.7922802865505219, | |
| "rewards/cosine_scaled_reward": 0.26944761723279953, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 1645.3750305175781, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 1.1035624742507935, | |
| "kl": 0.212158203125, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0085, | |
| "reward": 0.28750851564109325, | |
| "reward_std": 0.3911990597844124, | |
| "rewards/cosine_scaled_reward": -0.252079077064991, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 1476.8333740234375, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 1.2739105224609375, | |
| "kl": 0.1046142578125, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0042, | |
| "reward": 0.836659163236618, | |
| "reward_std": 0.8671058118343353, | |
| "rewards/cosine_scaled_reward": -0.04000374022871256, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 1723.3333892822266, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 1.672484040260315, | |
| "kl": 0.21124267578125, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0084, | |
| "reward": 0.5473275234689936, | |
| "reward_std": 0.6023431569337845, | |
| "rewards/cosine_scaled_reward": -0.11175291612744331, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 1291.8542175292969, | |
| "epoch": 0.384, | |
| "grad_norm": 1.1949892044067383, | |
| "kl": 0.13958740234375, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.0056, | |
| "reward": 0.43108636140823364, | |
| "reward_std": 0.42728982865810394, | |
| "rewards/cosine_scaled_reward": -0.22195683978497982, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 1024.3333740234375, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.902114748954773, | |
| "kl": 0.11954498291015625, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0048, | |
| "reward": 1.0072421729564667, | |
| "reward_std": 0.6713532879948616, | |
| "rewards/cosine_scaled_reward": 0.045287732034921646, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 1448.9375457763672, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 1.2535922527313232, | |
| "kl": 0.1561737060546875, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0062, | |
| "reward": 0.5455589033663273, | |
| "reward_std": 0.535648949444294, | |
| "rewards/cosine_scaled_reward": -0.13347055204212666, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 2035.6459045410156, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 0.746334969997406, | |
| "kl": 0.178955078125, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.0071, | |
| "reward": 0.593096449971199, | |
| "reward_std": 0.948510006070137, | |
| "rewards/cosine_scaled_reward": -0.08886844478547573, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 1074.1041870117188, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 0.5353119373321533, | |
| "kl": 0.041778564453125, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": 0.0017, | |
| "reward": 1.1473649591207504, | |
| "reward_std": 0.7414836436510086, | |
| "rewards/cosine_scaled_reward": 0.07368248514831066, | |
| "rewards/format_reward": 1.0, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 1197.7083892822266, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 1.1976772546768188, | |
| "kl": 0.1300048828125, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0052, | |
| "reward": 1.3422877192497253, | |
| "reward_std": 0.6801351606845856, | |
| "rewards/cosine_scaled_reward": 0.1919771609827876, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 1645.479248046875, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 1.725953459739685, | |
| "kl": 0.33575439453125, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0135, | |
| "reward": 0.45842229574918747, | |
| "reward_std": 0.5379992946982384, | |
| "rewards/cosine_scaled_reward": -0.1457888763397932, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 1599.0416870117188, | |
| "epoch": 0.392, | |
| "grad_norm": 1.9615150690078735, | |
| "kl": 0.2664794921875, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0107, | |
| "reward": 0.8948207944631577, | |
| "reward_std": 0.6805202513933182, | |
| "rewards/cosine_scaled_reward": 0.009910388849675655, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 1742.8750610351562, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 41.373931884765625, | |
| "kl": 0.8536376953125, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0343, | |
| "reward": 0.8157278522849083, | |
| "reward_std": 0.7099937200546265, | |
| "rewards/cosine_scaled_reward": -0.029636098071932793, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 1436.8125610351562, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 1.8816604614257812, | |
| "kl": 0.26708984375, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0107, | |
| "reward": 0.9264494627714157, | |
| "reward_std": 0.6063774973154068, | |
| "rewards/cosine_scaled_reward": 0.015308059751987457, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 1363.9792022705078, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 2.3164970874786377, | |
| "kl": 0.09549713134765625, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.0038, | |
| "reward": 1.0577785670757294, | |
| "reward_std": 0.8408964425325394, | |
| "rewards/cosine_scaled_reward": 0.07055594399571419, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1337.7083892822266, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 1.1808565855026245, | |
| "kl": 0.2160491943359375, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0086, | |
| "reward": 1.5314601063728333, | |
| "reward_std": 0.8420404642820358, | |
| "rewards/cosine_scaled_reward": 0.31781339878216386, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 1250.9791870117188, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 1.317704439163208, | |
| "kl": 0.178863525390625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0072, | |
| "reward": 0.746107667684555, | |
| "reward_std": 0.5483127310872078, | |
| "rewards/cosine_scaled_reward": -0.10611284070182592, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1193.000015258789, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 1.4739476442337036, | |
| "kl": 0.249725341796875, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": 0.01, | |
| "reward": 1.2662739604711533, | |
| "reward_std": 0.7532782405614853, | |
| "rewards/cosine_scaled_reward": 0.18522025644779205, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 1177.3541870117188, | |
| "epoch": 0.4, | |
| "grad_norm": 2.2546327114105225, | |
| "kl": 0.24993896484375, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.01, | |
| "reward": 1.0857526510953903, | |
| "reward_std": 0.6757391728460789, | |
| "rewards/cosine_scaled_reward": 0.09495963307563215, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 1196.0000457763672, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 1.1820554733276367, | |
| "kl": 0.15570068359375, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0062, | |
| "reward": 0.5796034894883633, | |
| "reward_std": 0.5287084579467773, | |
| "rewards/cosine_scaled_reward": -0.178948275744915, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 1270.5416870117188, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 1.5620218515396118, | |
| "kl": 0.12603759765625, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.005, | |
| "reward": 0.7529177367687225, | |
| "reward_std": 0.6913501024246216, | |
| "rewards/cosine_scaled_reward": -0.09229113161563873, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 1443.1666870117188, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 2.1429245471954346, | |
| "kl": 0.18780517578125, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.0075, | |
| "reward": 0.9008820652961731, | |
| "reward_std": 0.6463882178068161, | |
| "rewards/cosine_scaled_reward": -0.007892303168773651, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 1230.6458587646484, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.9503077268600464, | |
| "kl": 0.2082061767578125, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0083, | |
| "reward": 0.4887809455394745, | |
| "reward_std": 0.4611463025212288, | |
| "rewards/cosine_scaled_reward": -0.23477619886398315, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 930.0625305175781, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.8462334275245667, | |
| "kl": 0.1320648193359375, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0053, | |
| "reward": 0.8714643996208906, | |
| "reward_std": 0.35595114156603813, | |
| "rewards/cosine_scaled_reward": -0.022601131349802017, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 1521.2500610351562, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 3.6062703132629395, | |
| "kl": 0.31671142578125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0126, | |
| "reward": 0.7512324824929237, | |
| "reward_std": 0.7443573772907257, | |
| "rewards/cosine_scaled_reward": -0.05146709643304348, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 1307.0625305175781, | |
| "epoch": 0.408, | |
| "grad_norm": 2.073002576828003, | |
| "kl": 0.2852783203125, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0114, | |
| "reward": 0.6376607120037079, | |
| "reward_std": 0.5459994077682495, | |
| "rewards/cosine_scaled_reward": -0.13950299471616745, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 1298.2083587646484, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 1.5467488765716553, | |
| "kl": 0.34130859375, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.0136, | |
| "reward": 0.881379060447216, | |
| "reward_std": 0.8507445156574249, | |
| "rewards/cosine_scaled_reward": 0.003189507406204939, | |
| "rewards/format_reward": 0.875, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 1172.7916870117188, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 1.671886682510376, | |
| "kl": 0.314453125, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0126, | |
| "reward": 0.7981488406658173, | |
| "reward_std": 0.5063908323645592, | |
| "rewards/cosine_scaled_reward": -0.04884226247668266, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 990.9792175292969, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 2.374612808227539, | |
| "kl": 0.41845703125, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.0168, | |
| "reward": 1.1945213824510574, | |
| "reward_std": 0.7698078602552414, | |
| "rewards/cosine_scaled_reward": 0.11809402331709862, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1524.3541870117188, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 3.4913723468780518, | |
| "kl": 1.1523895263671875, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": 0.0461, | |
| "reward": 0.6581477224826813, | |
| "reward_std": 0.6756026446819305, | |
| "rewards/cosine_scaled_reward": -0.05634281662059948, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 1185.2708587646484, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 34.620460510253906, | |
| "kl": 0.832366943359375, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0334, | |
| "reward": 0.8447651118040085, | |
| "reward_std": 0.7907865196466446, | |
| "rewards/cosine_scaled_reward": -0.046367482747882605, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 1141.0208892822266, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 2.2585678100585938, | |
| "kl": 0.352935791015625, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0141, | |
| "reward": 0.5232757963240147, | |
| "reward_std": 0.5396992526948452, | |
| "rewards/cosine_scaled_reward": -0.1966954478994012, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1359.1667175292969, | |
| "epoch": 0.416, | |
| "grad_norm": 3.5956501960754395, | |
| "kl": 0.715576171875, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.0286, | |
| "reward": 0.6145341023802757, | |
| "reward_std": 0.5419690161943436, | |
| "rewards/cosine_scaled_reward": -0.17189963907003403, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1670.229248046875, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 3.5323920249938965, | |
| "kl": 1.1943359375, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0479, | |
| "reward": 0.40414058696478605, | |
| "reward_std": 0.691382423043251, | |
| "rewards/cosine_scaled_reward": -0.17292970418930054, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 1720.5000305175781, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 3.34942364692688, | |
| "kl": 1.042236328125, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.0417, | |
| "reward": 0.6996188908815384, | |
| "reward_std": 0.7247656881809235, | |
| "rewards/cosine_scaled_reward": -0.0460239015519619, | |
| "rewards/format_reward": 0.791666679084301, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 927.6458587646484, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 1.461083173751831, | |
| "kl": 0.38287353515625, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0153, | |
| "reward": 1.2472570352256298, | |
| "reward_std": 0.6670772060751915, | |
| "rewards/cosine_scaled_reward": 0.14446185529232025, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1301.9792175292969, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 2.861830711364746, | |
| "kl": 0.337615966796875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": 0.0135, | |
| "reward": 1.2516463994979858, | |
| "reward_std": 1.0124248266220093, | |
| "rewards/cosine_scaled_reward": 0.17790652811527252, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1215.5625, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 1.6978230476379395, | |
| "kl": 0.335235595703125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": 0.0134, | |
| "reward": 1.1675912141799927, | |
| "reward_std": 0.5593391507863998, | |
| "rewards/cosine_scaled_reward": 0.13587890937924385, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 1309.5208740234375, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 2.2963132858276367, | |
| "kl": 0.4232177734375, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": 0.0169, | |
| "reward": 0.5871782079339027, | |
| "reward_std": 0.778529703617096, | |
| "rewards/cosine_scaled_reward": -0.14391089417040348, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1090.1042022705078, | |
| "epoch": 0.424, | |
| "grad_norm": 0.9345715045928955, | |
| "kl": 0.28076171875, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0112, | |
| "reward": 0.6981654912233353, | |
| "reward_std": 0.6373357623815536, | |
| "rewards/cosine_scaled_reward": -0.09883392881602049, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 1309.9375610351562, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.7092093825340271, | |
| "kl": 0.486602783203125, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": 0.0194, | |
| "reward": 0.8811748586595058, | |
| "reward_std": 0.5044156312942505, | |
| "rewards/cosine_scaled_reward": -0.007329270243644714, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 1460.7708587646484, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 2.416278839111328, | |
| "kl": 0.5400390625, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0216, | |
| "reward": 0.9694189727306366, | |
| "reward_std": 0.6003681719303131, | |
| "rewards/cosine_scaled_reward": 0.0472094789147377, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 1132.5208587646484, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 1.0274022817611694, | |
| "kl": 0.29541015625, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0118, | |
| "reward": 1.1347306370735168, | |
| "reward_std": 0.5300607345998287, | |
| "rewards/cosine_scaled_reward": 0.07778198271989822, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1205.6042175292969, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 1.4747833013534546, | |
| "kl": 0.336669921875, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0135, | |
| "reward": 0.914167582988739, | |
| "reward_std": 0.7538186013698578, | |
| "rewards/cosine_scaled_reward": -0.011666236445307732, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1323.9583740234375, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 1.5346781015396118, | |
| "kl": 0.40960693359375, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0164, | |
| "reward": 0.9741204902529716, | |
| "reward_std": 0.8010425865650177, | |
| "rewards/cosine_scaled_reward": 0.018310231156647205, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 1173.8333892822266, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 1.3860251903533936, | |
| "kl": 0.32177734375, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0129, | |
| "reward": 1.1005370616912842, | |
| "reward_std": 0.8687849044799805, | |
| "rewards/cosine_scaled_reward": 0.11276852712035179, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 1077.0208587646484, | |
| "epoch": 0.432, | |
| "grad_norm": 1.6222304105758667, | |
| "kl": 0.325439453125, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.013, | |
| "reward": 0.7557893544435501, | |
| "reward_std": 0.46596937626600266, | |
| "rewards/cosine_scaled_reward": -0.12210530787706375, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1240.3333740234375, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 1.0828309059143066, | |
| "kl": 0.356201171875, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0142, | |
| "reward": 0.8545220792293549, | |
| "reward_std": 0.6498586684465408, | |
| "rewards/cosine_scaled_reward": -0.05190563574433327, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 1270.6667175292969, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 1.8103889226913452, | |
| "kl": 0.383026123046875, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0153, | |
| "reward": 0.6777837425470352, | |
| "reward_std": 0.7819753885269165, | |
| "rewards/cosine_scaled_reward": -0.08819146640598774, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1318.7292175292969, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 1.7676132917404175, | |
| "kl": 0.482666015625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0193, | |
| "reward": 0.7255931571125984, | |
| "reward_std": 0.6526116281747818, | |
| "rewards/cosine_scaled_reward": -0.10595342982560396, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1141.7917022705078, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 1.832429051399231, | |
| "kl": 0.3201904296875, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0128, | |
| "reward": 0.7391386441886425, | |
| "reward_std": 0.7291494160890579, | |
| "rewards/cosine_scaled_reward": -0.07834736630320549, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1615.541732788086, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 2.4247608184814453, | |
| "kl": 1.1395721435546875, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.0456, | |
| "reward": 0.8856848031282425, | |
| "reward_std": 0.5492001250386238, | |
| "rewards/cosine_scaled_reward": 0.005342394113540649, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1421.0833435058594, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 1.442662000656128, | |
| "kl": 0.374847412109375, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.015, | |
| "reward": 0.32889158837497234, | |
| "reward_std": 0.4332951605319977, | |
| "rewards/cosine_scaled_reward": -0.26263754442334175, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1334.5208435058594, | |
| "epoch": 0.44, | |
| "grad_norm": 1.4730134010314941, | |
| "kl": 0.422088623046875, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.0169, | |
| "reward": 0.8442478328943253, | |
| "reward_std": 0.6724153012037277, | |
| "rewards/cosine_scaled_reward": -0.05704276263713837, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1111.4583740234375, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 1.538129448890686, | |
| "kl": 0.181640625, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0073, | |
| "reward": 0.8161994330585003, | |
| "reward_std": 0.4277452155947685, | |
| "rewards/cosine_scaled_reward": -0.06065032631158829, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 1218.0208435058594, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 1.626770257949829, | |
| "kl": 0.309906005859375, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0124, | |
| "reward": 0.6058969795703888, | |
| "reward_std": 0.7323561906814575, | |
| "rewards/cosine_scaled_reward": -0.1658015362918377, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1258.187515258789, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 1.7491554021835327, | |
| "kl": 0.3914794921875, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0156, | |
| "reward": 0.7627854868769646, | |
| "reward_std": 0.7414772808551788, | |
| "rewards/cosine_scaled_reward": -0.05610728543251753, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1268.4375305175781, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 1.6758959293365479, | |
| "kl": 0.440826416015625, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0176, | |
| "reward": 1.1525626480579376, | |
| "reward_std": 0.7809968441724777, | |
| "rewards/cosine_scaled_reward": 0.12836465798318386, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 940.9166870117188, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 1.7198045253753662, | |
| "kl": 0.3330078125, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": 0.0133, | |
| "reward": 0.9264688044786453, | |
| "reward_std": 0.6931325197219849, | |
| "rewards/cosine_scaled_reward": -0.03676560753956437, | |
| "rewards/format_reward": 1.0, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 978.7291717529297, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.6671801805496216, | |
| "kl": 0.029083251953125, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": 0.0012, | |
| "reward": 0.6134733706712723, | |
| "reward_std": 0.4728550612926483, | |
| "rewards/cosine_scaled_reward": -0.1932633202522993, | |
| "rewards/format_reward": 1.0, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1309.5417175292969, | |
| "epoch": 0.448, | |
| "grad_norm": 2.3814332485198975, | |
| "kl": 0.251953125, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0101, | |
| "reward": 1.0050799548625946, | |
| "reward_std": 0.8785246461629868, | |
| "rewards/cosine_scaled_reward": 0.07545664254575968, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1097.3125305175781, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 1.722952961921692, | |
| "kl": 0.2423095703125, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0097, | |
| "reward": 0.832743689417839, | |
| "reward_std": 0.8567025661468506, | |
| "rewards/cosine_scaled_reward": -0.0419615093851462, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 1223.0833740234375, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 2.990572452545166, | |
| "kl": 0.4295654296875, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0172, | |
| "reward": 0.5410304628312588, | |
| "reward_std": 0.6248824968934059, | |
| "rewards/cosine_scaled_reward": -0.13573478162288666, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 1189.500015258789, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 2.629859685897827, | |
| "kl": 0.321044921875, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0128, | |
| "reward": 0.5656278505921364, | |
| "reward_std": 0.7980157136917114, | |
| "rewards/cosine_scaled_reward": -0.1130194254219532, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1043.1458435058594, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 1.0702786445617676, | |
| "kl": 0.0563507080078125, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0023, | |
| "reward": 0.7221028283238411, | |
| "reward_std": 0.5050450935959816, | |
| "rewards/cosine_scaled_reward": -0.11811527609825134, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 1230.7292175292969, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 3.2597548961639404, | |
| "kl": 0.392822265625, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0157, | |
| "reward": 1.312897451221943, | |
| "reward_std": 0.6480658948421478, | |
| "rewards/cosine_scaled_reward": 0.2293653730303049, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1381.5625305175781, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 1.6307339668273926, | |
| "kl": 0.20714569091796875, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0083, | |
| "reward": 0.7576578855514526, | |
| "reward_std": 0.7408057749271393, | |
| "rewards/cosine_scaled_reward": -0.10033774503972381, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 1487.0208740234375, | |
| "epoch": 0.456, | |
| "grad_norm": 2.9457855224609375, | |
| "kl": 0.426025390625, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0171, | |
| "reward": 0.6220748424530029, | |
| "reward_std": 0.5146789476275444, | |
| "rewards/cosine_scaled_reward": -0.09521258249878883, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1041.7708587646484, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 5.961241722106934, | |
| "kl": 0.289337158203125, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0116, | |
| "reward": 0.8267193324863911, | |
| "reward_std": 0.6178570240736008, | |
| "rewards/cosine_scaled_reward": -0.04497369006276131, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 965.6250305175781, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 1.4047675132751465, | |
| "kl": 0.1659698486328125, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0066, | |
| "reward": 0.982437789440155, | |
| "reward_std": 0.7459337636828423, | |
| "rewards/cosine_scaled_reward": 0.0537188770249486, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 1609.729248046875, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 2.311453342437744, | |
| "kl": 0.615234375, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0246, | |
| "reward": 0.3766894303262234, | |
| "reward_std": 0.658165842294693, | |
| "rewards/cosine_scaled_reward": -0.2387386392802, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 1466.9375610351562, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 2.0651133060455322, | |
| "kl": 0.7806396484375, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.0312, | |
| "reward": 0.5563420876860619, | |
| "reward_std": 0.6794729232788086, | |
| "rewards/cosine_scaled_reward": -0.1489123017527163, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 1289.2708740234375, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 2.5164127349853516, | |
| "kl": 0.3046875, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0122, | |
| "reward": 0.8672967702150345, | |
| "reward_std": 0.7338433116674423, | |
| "rewards/cosine_scaled_reward": -0.03510164050385356, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 1246.5833587646484, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 1.7397247552871704, | |
| "kl": 0.14056396484375, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0056, | |
| "reward": 0.7972104996442795, | |
| "reward_std": 0.6757538244128227, | |
| "rewards/cosine_scaled_reward": 0.0027718953788280487, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 1569.5209045410156, | |
| "epoch": 0.464, | |
| "grad_norm": 3.1697490215301514, | |
| "kl": 1.05712890625, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.0423, | |
| "reward": 0.7342821173369884, | |
| "reward_std": 0.6933150887489319, | |
| "rewards/cosine_scaled_reward": -0.05994228646159172, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 970.0625457763672, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 2.7586686611175537, | |
| "kl": 0.496337890625, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0198, | |
| "reward": 0.7943353094160557, | |
| "reward_std": 0.5273813158273697, | |
| "rewards/cosine_scaled_reward": -0.07158234342932701, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 954.7083435058594, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 1.7943812608718872, | |
| "kl": 0.168701171875, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.0067, | |
| "reward": 0.9154380261898041, | |
| "reward_std": 0.63597172498703, | |
| "rewards/cosine_scaled_reward": -0.02144765853881836, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 1268.2291717529297, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.9967200756072998, | |
| "kl": 0.53662109375, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0215, | |
| "reward": 0.9619596749544144, | |
| "reward_std": 0.6098195463418961, | |
| "rewards/cosine_scaled_reward": 0.022646483033895493, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 1628.0000610351562, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 5.845465660095215, | |
| "kl": 0.8017578125, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0321, | |
| "reward": 0.6380558162927628, | |
| "reward_std": 0.5308569446206093, | |
| "rewards/cosine_scaled_reward": -0.14972211251733825, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 1665.7083740234375, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 1.869497537612915, | |
| "kl": 0.796875, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0319, | |
| "reward": 0.31100673973560333, | |
| "reward_std": 0.5528412610292435, | |
| "rewards/cosine_scaled_reward": -0.27157998457551, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 1114.604232788086, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 1.987026333808899, | |
| "kl": 0.284912109375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.0114, | |
| "reward": 0.7781837359070778, | |
| "reward_std": 0.5904239565134048, | |
| "rewards/cosine_scaled_reward": -0.09007478877902031, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 1371.5417175292969, | |
| "epoch": 0.472, | |
| "grad_norm": 2.023742198944092, | |
| "kl": 0.481689453125, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0193, | |
| "reward": 0.7588743381202221, | |
| "reward_std": 0.7781134992837906, | |
| "rewards/cosine_scaled_reward": -0.06847950583323836, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 1710.7292175292969, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 3.7236738204956055, | |
| "kl": 1.0467529296875, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.042, | |
| "reward": 0.6331272795796394, | |
| "reward_std": 0.7422880977392197, | |
| "rewards/cosine_scaled_reward": -0.06885303813032806, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 1409.2708740234375, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 2.454448938369751, | |
| "kl": 0.506103515625, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0202, | |
| "reward": 0.9847299754619598, | |
| "reward_std": 0.6669813543558121, | |
| "rewards/cosine_scaled_reward": 0.023614969104528427, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 1729.4791870117188, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 2.803572416305542, | |
| "kl": 0.96875, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0387, | |
| "reward": 0.4970005638897419, | |
| "reward_std": 0.8149459362030029, | |
| "rewards/cosine_scaled_reward": -0.10566640645265579, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 1810.1875457763672, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 5.9517717361450195, | |
| "kl": 1.576904296875, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.0632, | |
| "reward": 0.5241810567677021, | |
| "reward_std": 0.64292823523283, | |
| "rewards/cosine_scaled_reward": -0.08165947627276182, | |
| "rewards/format_reward": 0.6875000074505806, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 1492.8958740234375, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 4.798679828643799, | |
| "kl": 0.8544921875, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0342, | |
| "reward": 0.4330623224377632, | |
| "reward_std": 0.5981302931904793, | |
| "rewards/cosine_scaled_reward": -0.22096885181963444, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 1555.2708435058594, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 4.279539585113525, | |
| "kl": 0.86328125, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0346, | |
| "reward": 0.5117529258131981, | |
| "reward_std": 0.6584911718964577, | |
| "rewards/cosine_scaled_reward": -0.1399568784981966, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 1216.3542175292969, | |
| "epoch": 0.48, | |
| "grad_norm": 2.101778030395508, | |
| "kl": 0.41534423828125, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0167, | |
| "reward": 0.896861981600523, | |
| "reward_std": 0.856042891740799, | |
| "rewards/cosine_scaled_reward": 0.0005143135786056519, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 1162.3750457763672, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 2.8854591846466064, | |
| "kl": 0.33807373046875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0135, | |
| "reward": 0.8688920065760612, | |
| "reward_std": 0.7225290387868881, | |
| "rewards/cosine_scaled_reward": -0.013470660895109177, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 1370.8541870117188, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 1.7124054431915283, | |
| "kl": 0.51220703125, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0205, | |
| "reward": 0.7653668001294136, | |
| "reward_std": 0.7626539617776871, | |
| "rewards/cosine_scaled_reward": -0.07564992923289537, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 1183.708396911621, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.421221137046814, | |
| "kl": 0.293243408203125, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0117, | |
| "reward": 0.9164831042289734, | |
| "reward_std": 0.533292543143034, | |
| "rewards/cosine_scaled_reward": -0.01050846092402935, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 1333.6250305175781, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 2.020719289779663, | |
| "kl": 0.547119140625, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0219, | |
| "reward": 0.5903327092528343, | |
| "reward_std": 0.5696832239627838, | |
| "rewards/cosine_scaled_reward": -0.11108366213738918, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 1426.3542175292969, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 2.2707631587982178, | |
| "kl": 0.79052734375, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0316, | |
| "reward": 0.5219599902629852, | |
| "reward_std": 0.5760443955659866, | |
| "rewards/cosine_scaled_reward": -0.14527002349495888, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 1259.3333740234375, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 2.267915725708008, | |
| "kl": 0.395263671875, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0158, | |
| "reward": 0.8012158274650574, | |
| "reward_std": 0.667873740196228, | |
| "rewards/cosine_scaled_reward": -0.06814211048185825, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 1615.0416870117188, | |
| "epoch": 0.488, | |
| "grad_norm": 2.34485125541687, | |
| "kl": 0.85791015625, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0343, | |
| "reward": 0.7913745269179344, | |
| "reward_std": 0.7925111949443817, | |
| "rewards/cosine_scaled_reward": -0.00014608167111873627, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 1747.8333740234375, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 2.9694297313690186, | |
| "kl": 0.9736328125, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0389, | |
| "reward": 0.55228191614151, | |
| "reward_std": 0.6319544315338135, | |
| "rewards/cosine_scaled_reward": -0.1509423702955246, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 1631.5417175292969, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 3.960716724395752, | |
| "kl": 1.03509521484375, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0414, | |
| "reward": 0.8934581205248833, | |
| "reward_std": 0.5868038833141327, | |
| "rewards/cosine_scaled_reward": 0.03006240352988243, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 1621.4375610351562, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 8.593777656555176, | |
| "kl": 1.48388671875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0594, | |
| "reward": 0.48539859987795353, | |
| "reward_std": 0.7700510919094086, | |
| "rewards/cosine_scaled_reward": -0.12188405683264136, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 1201.0208587646484, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 1.7287312746047974, | |
| "kl": 0.67041015625, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0268, | |
| "reward": 0.9445644542574883, | |
| "reward_std": 0.5076880529522896, | |
| "rewards/cosine_scaled_reward": 0.0035321786999702454, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 1444.2084045410156, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 3.417004346847534, | |
| "kl": 0.6171875, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0247, | |
| "reward": 0.6378493383526802, | |
| "reward_std": 0.7811735272407532, | |
| "rewards/cosine_scaled_reward": -0.09774199151434004, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 1373.5417175292969, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 1.7710994482040405, | |
| "kl": 0.712890625, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0286, | |
| "reward": 0.6063527911901474, | |
| "reward_std": 0.6604797914624214, | |
| "rewards/cosine_scaled_reward": -0.144740279763937, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 977.5208435058594, | |
| "epoch": 0.496, | |
| "grad_norm": 1.736846685409546, | |
| "kl": 0.294708251953125, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": 0.0118, | |
| "reward": 1.2148061096668243, | |
| "reward_std": 0.5557400360703468, | |
| "rewards/cosine_scaled_reward": 0.11781970039010048, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 1433.8542175292969, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 2.2994189262390137, | |
| "kl": 1.136810302734375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.0454, | |
| "reward": 0.4324219524860382, | |
| "reward_std": 0.8103198558092117, | |
| "rewards/cosine_scaled_reward": -0.1900390349328518, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 1499.3750305175781, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 2.9299046993255615, | |
| "kl": 0.6876220703125, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0275, | |
| "reward": 0.6081612259149551, | |
| "reward_std": 0.8525863587856293, | |
| "rewards/cosine_scaled_reward": -0.1334194028750062, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 1131.1458892822266, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 1.970673680305481, | |
| "kl": 0.24951171875, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.01, | |
| "reward": 0.8882552683353424, | |
| "reward_std": 0.7682344913482666, | |
| "rewards/cosine_scaled_reward": -0.02462236536666751, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 1025.5208740234375, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 1.1531387567520142, | |
| "kl": 0.20703125, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.0083, | |
| "reward": 0.9214688986539841, | |
| "reward_std": 0.6478733271360397, | |
| "rewards/cosine_scaled_reward": -0.03926557023078203, | |
| "rewards/format_reward": 1.0, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 1004.7708740234375, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 1.5415436029434204, | |
| "kl": 0.22320556640625, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.0089, | |
| "reward": 0.6858842521905899, | |
| "reward_std": 0.581585705280304, | |
| "rewards/cosine_scaled_reward": -0.1466412227600813, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 1667.5000610351562, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 1.8554582595825195, | |
| "kl": 0.833984375, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0334, | |
| "reward": 0.6555005759000778, | |
| "reward_std": 0.8219043761491776, | |
| "rewards/cosine_scaled_reward": -0.0784997058508452, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 1507.8542175292969, | |
| "epoch": 0.504, | |
| "grad_norm": 3.88932466506958, | |
| "kl": 0.70849609375, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0284, | |
| "reward": 0.7302692234516144, | |
| "reward_std": 0.6611650586128235, | |
| "rewards/cosine_scaled_reward": -0.07236538827419281, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 1471.1250305175781, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 3.0517144203186035, | |
| "kl": 0.61572265625, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0247, | |
| "reward": 0.6592699624598026, | |
| "reward_std": 0.7461230009794235, | |
| "rewards/cosine_scaled_reward": -0.11828170018270612, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 1453.1458740234375, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 2.053025722503662, | |
| "kl": 0.724609375, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.029, | |
| "reward": 0.5722346976399422, | |
| "reward_std": 0.643949382007122, | |
| "rewards/cosine_scaled_reward": -0.1305493265390396, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 1375.6875305175781, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 1.6717604398727417, | |
| "kl": 0.594390869140625, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0237, | |
| "reward": 1.0815926790237427, | |
| "reward_std": 0.7383088618516922, | |
| "rewards/cosine_scaled_reward": 0.08246299810707569, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 1785.2708740234375, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 3.6348721981048584, | |
| "kl": 1.021484375, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0408, | |
| "reward": 0.6278277039527893, | |
| "reward_std": 0.6637818515300751, | |
| "rewards/cosine_scaled_reward": -0.10275283083319664, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 1080.1458587646484, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 1.7892571687698364, | |
| "kl": 0.1240234375, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.005, | |
| "reward": 0.8095807060599327, | |
| "reward_std": 0.6449108496308327, | |
| "rewards/cosine_scaled_reward": -0.053542979061603546, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 1139.9166870117188, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 1.82309091091156, | |
| "kl": 0.305145263671875, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0122, | |
| "reward": 0.9205853343009949, | |
| "reward_std": 0.7828942686319351, | |
| "rewards/cosine_scaled_reward": -0.02929066913202405, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 1278.1250305175781, | |
| "epoch": 0.512, | |
| "grad_norm": 2.259813070297241, | |
| "kl": 0.251708984375, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0101, | |
| "reward": 0.8140432685613632, | |
| "reward_std": 0.658391922712326, | |
| "rewards/cosine_scaled_reward": -0.05131170805543661, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 1173.3125305175781, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 1.361348032951355, | |
| "kl": 0.3782958984375, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0151, | |
| "reward": 0.9565356224775314, | |
| "reward_std": 0.7290000915527344, | |
| "rewards/cosine_scaled_reward": 0.019934438169002533, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 1371.1667175292969, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 3.834918737411499, | |
| "kl": 0.513427734375, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.0205, | |
| "reward": 0.7457224242389202, | |
| "reward_std": 0.5297245979309082, | |
| "rewards/cosine_scaled_reward": -0.04380548745393753, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 1255.6250305175781, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 2.335192918777466, | |
| "kl": 0.6416015625, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0257, | |
| "reward": 1.0948645919561386, | |
| "reward_std": 0.7944803088903427, | |
| "rewards/cosine_scaled_reward": 0.10993227222934365, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 1190.6666870117188, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 1.7608739137649536, | |
| "kl": 0.500244140625, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.02, | |
| "reward": 0.5270581915974617, | |
| "reward_std": 0.6462119966745377, | |
| "rewards/cosine_scaled_reward": -0.1843875776976347, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 1610.3125610351562, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 2.1905384063720703, | |
| "kl": 0.96484375, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0386, | |
| "reward": 0.5264739021658897, | |
| "reward_std": 0.707670621573925, | |
| "rewards/cosine_scaled_reward": -0.1325964080169797, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 1178.0000305175781, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 1.7162837982177734, | |
| "kl": 0.55322265625, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0221, | |
| "reward": 0.7971140295267105, | |
| "reward_std": 0.5913056135177612, | |
| "rewards/cosine_scaled_reward": -0.059776326175779104, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 1480.6250305175781, | |
| "epoch": 0.52, | |
| "grad_norm": 4.159846305847168, | |
| "kl": 1.05126953125, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0421, | |
| "reward": 0.6366595476865768, | |
| "reward_std": 0.44621995836496353, | |
| "rewards/cosine_scaled_reward": -0.11917022056877613, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 1499.1458740234375, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 3.277935743331909, | |
| "kl": 0.708251953125, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0284, | |
| "reward": 0.7240877486765385, | |
| "reward_std": 0.5471076965332031, | |
| "rewards/cosine_scaled_reward": -0.05462279508356005, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 1379.7292175292969, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 2.5969083309173584, | |
| "kl": 0.6796875, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0272, | |
| "reward": 0.869850842282176, | |
| "reward_std": 0.700515478849411, | |
| "rewards/cosine_scaled_reward": 0.02867540717124939, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 1159.9791870117188, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 2.355196952819824, | |
| "kl": 0.31744384765625, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0127, | |
| "reward": 0.923922210931778, | |
| "reward_std": 0.5514720380306244, | |
| "rewards/cosine_scaled_reward": -0.017205584794282913, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 1273.5833892822266, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 4.148768901824951, | |
| "kl": 0.59765625, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0239, | |
| "reward": 0.574098750948906, | |
| "reward_std": 0.7631915658712387, | |
| "rewards/cosine_scaled_reward": -0.16086730360984802, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 1453.9792175292969, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 3.0608623027801514, | |
| "kl": 0.8349609375, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0335, | |
| "reward": 1.0540032014250755, | |
| "reward_std": 0.7098206132650375, | |
| "rewards/cosine_scaled_reward": 0.07908494677394629, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 1280.1667175292969, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 1.6031488180160522, | |
| "kl": 0.51483154296875, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0206, | |
| "reward": 1.0345441699028015, | |
| "reward_std": 0.464998334646225, | |
| "rewards/cosine_scaled_reward": 0.05893874540925026, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 1348.2083740234375, | |
| "epoch": 0.528, | |
| "grad_norm": 2.087766647338867, | |
| "kl": 0.288330078125, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0115, | |
| "reward": 0.5173667185008526, | |
| "reward_std": 0.5679958164691925, | |
| "rewards/cosine_scaled_reward": -0.18923332425765693, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 1184.8333892822266, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 2.3618013858795166, | |
| "kl": 0.67828369140625, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.027, | |
| "reward": 0.8587629348039627, | |
| "reward_std": 0.5902151763439178, | |
| "rewards/cosine_scaled_reward": 0.002298124134540558, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 1381.3750610351562, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 2.5598762035369873, | |
| "kl": 0.4290771484375, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.0172, | |
| "reward": 0.8569861799478531, | |
| "reward_std": 0.5644854605197906, | |
| "rewards/cosine_scaled_reward": -0.04025692865252495, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 1754.0000610351562, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 3.649667739868164, | |
| "kl": 1.2236328125, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.049, | |
| "reward": 0.6691107526421547, | |
| "reward_std": 0.7533622533082962, | |
| "rewards/cosine_scaled_reward": -0.050861308351159096, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 1626.979248046875, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 4.014570713043213, | |
| "kl": 1.1494140625, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.046, | |
| "reward": 0.39536508079618216, | |
| "reward_std": 0.5428843200206757, | |
| "rewards/cosine_scaled_reward": -0.18773413076996803, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 1410.1875305175781, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 6.032597064971924, | |
| "kl": 0.89404296875, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0358, | |
| "reward": 0.7157600894570351, | |
| "reward_std": 0.6754554212093353, | |
| "rewards/cosine_scaled_reward": -0.07961997389793396, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 1505.0208740234375, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 3.0937063694000244, | |
| "kl": 1.083251953125, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0434, | |
| "reward": 1.0479508265852928, | |
| "reward_std": 0.7750177532434464, | |
| "rewards/cosine_scaled_reward": 0.08647541608661413, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 1116.0208435058594, | |
| "epoch": 0.536, | |
| "grad_norm": 2.7102174758911133, | |
| "kl": 0.68310546875, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.0273, | |
| "reward": 0.8038362823426723, | |
| "reward_std": 0.7213334441184998, | |
| "rewards/cosine_scaled_reward": -0.04599856585264206, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 1321.1458740234375, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 6.9517316818237305, | |
| "kl": 1.054931640625, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.0422, | |
| "reward": 0.6858988218009472, | |
| "reward_std": 0.6414925083518028, | |
| "rewards/cosine_scaled_reward": -0.0424672719091177, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 1012.8125457763672, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 15.012682914733887, | |
| "kl": 2.00054931640625, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0798, | |
| "reward": 0.7205886021256447, | |
| "reward_std": 0.697634182870388, | |
| "rewards/cosine_scaled_reward": -0.1084557194262743, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 1118.0000305175781, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 1.921370506286621, | |
| "kl": 0.70703125, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0282, | |
| "reward": 0.8347459137439728, | |
| "reward_std": 0.7502800822257996, | |
| "rewards/cosine_scaled_reward": -0.03054371359758079, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 1343.3333740234375, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 1.8898627758026123, | |
| "kl": 0.716796875, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.0287, | |
| "reward": 0.3787691295146942, | |
| "reward_std": 0.4741926044225693, | |
| "rewards/cosine_scaled_reward": -0.2585321292281151, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 1074.5417022705078, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 1.3952125310897827, | |
| "kl": 0.275390625, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.011, | |
| "reward": 1.0772456228733063, | |
| "reward_std": 0.8115750551223755, | |
| "rewards/cosine_scaled_reward": 0.09070614166557789, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 1536.7708740234375, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 1.9422410726547241, | |
| "kl": 0.89697265625, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.036, | |
| "reward": 0.7740568369626999, | |
| "reward_std": 0.7255712598562241, | |
| "rewards/cosine_scaled_reward": -0.06088825827464461, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 1238.6250305175781, | |
| "epoch": 0.544, | |
| "grad_norm": 1.791479468345642, | |
| "kl": 0.49365234375, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0198, | |
| "reward": 1.1282098963856697, | |
| "reward_std": 0.7448728978633881, | |
| "rewards/cosine_scaled_reward": 0.1266049058176577, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 926.4167175292969, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 1.0713694095611572, | |
| "kl": 0.20794677734375, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0083, | |
| "reward": 1.242761254310608, | |
| "reward_std": 0.48385217040777206, | |
| "rewards/cosine_scaled_reward": 0.13179726898670197, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 1389.8750457763672, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 1.6024447679519653, | |
| "kl": 0.315399169921875, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0126, | |
| "reward": 0.5958605632185936, | |
| "reward_std": 0.5410900861024857, | |
| "rewards/cosine_scaled_reward": -0.17081973887979984, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 1060.0000305175781, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 1.9674124717712402, | |
| "kl": 0.351318359375, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0141, | |
| "reward": 0.9066205322742462, | |
| "reward_std": 0.8546933829784393, | |
| "rewards/cosine_scaled_reward": -0.036273106932640076, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 1312.5208740234375, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 1.8547768592834473, | |
| "kl": 0.3245697021484375, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.013, | |
| "reward": 1.0583224594593048, | |
| "reward_std": 0.5566529557108879, | |
| "rewards/cosine_scaled_reward": 0.04999455437064171, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 1382.7500610351562, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 1.5025635957717896, | |
| "kl": 0.718505859375, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0287, | |
| "reward": 0.6311604380607605, | |
| "reward_std": 0.6715650781989098, | |
| "rewards/cosine_scaled_reward": -0.1323364470154047, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 1061.8750305175781, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 2.283987522125244, | |
| "kl": 0.368408203125, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0148, | |
| "reward": 0.9338921532034874, | |
| "reward_std": 0.663320891559124, | |
| "rewards/cosine_scaled_reward": -0.022637249901890755, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 1664.0000305175781, | |
| "epoch": 0.552, | |
| "grad_norm": 3.247492551803589, | |
| "kl": 1.43994140625, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0576, | |
| "reward": 0.4382214695215225, | |
| "reward_std": 0.6629593223333359, | |
| "rewards/cosine_scaled_reward": -0.1454725954681635, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 1166.9375457763672, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 1.635184407234192, | |
| "kl": 0.59814453125, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0239, | |
| "reward": 1.209791585803032, | |
| "reward_std": 0.48193909227848053, | |
| "rewards/cosine_scaled_reward": 0.12572911009192467, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 1562.9167022705078, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 1.5177675485610962, | |
| "kl": 1.32794189453125, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0532, | |
| "reward": 0.6973680332303047, | |
| "reward_std": 0.6229738295078278, | |
| "rewards/cosine_scaled_reward": -0.07839931827038527, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 1258.0416870117188, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 1.6634312868118286, | |
| "kl": 0.598846435546875, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0239, | |
| "reward": 0.9936239048838615, | |
| "reward_std": 0.7174008414149284, | |
| "rewards/cosine_scaled_reward": 0.038478586822748184, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 1263.3125610351562, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 3.9874322414398193, | |
| "kl": 0.56591796875, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0226, | |
| "reward": 0.9569487422704697, | |
| "reward_std": 0.776068776845932, | |
| "rewards/cosine_scaled_reward": 0.009724359028041363, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 1440.8333435058594, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 3.2814691066741943, | |
| "kl": 0.67236328125, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.0269, | |
| "reward": 0.4271909072995186, | |
| "reward_std": 0.5705131366848946, | |
| "rewards/cosine_scaled_reward": -0.2447379156947136, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 983.0625457763672, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 1.347419261932373, | |
| "kl": 0.39642333984375, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0158, | |
| "reward": 1.1842274367809296, | |
| "reward_std": 0.5909858122467995, | |
| "rewards/cosine_scaled_reward": 0.11294705420732498, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 1493.3958740234375, | |
| "epoch": 0.56, | |
| "grad_norm": 1.7785853147506714, | |
| "kl": 0.92236328125, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0368, | |
| "reward": 0.5224965363740921, | |
| "reward_std": 0.5668257884681225, | |
| "rewards/cosine_scaled_reward": -0.1450017336755991, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 1626.2916870117188, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 1.94631028175354, | |
| "kl": 1.02734375, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.0411, | |
| "reward": 0.6662824004888535, | |
| "reward_std": 0.6974209845066071, | |
| "rewards/cosine_scaled_reward": -0.07310881093144417, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 1049.5833740234375, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 2.7264091968536377, | |
| "kl": 0.43701171875, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0175, | |
| "reward": 0.8967996649444103, | |
| "reward_std": 0.40792329236865044, | |
| "rewards/cosine_scaled_reward": -0.02035021036863327, | |
| "rewards/format_reward": 0.9375, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 1251.0000610351562, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 1.9115831851959229, | |
| "kl": 0.535888671875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.0215, | |
| "reward": 0.7575453743338585, | |
| "reward_std": 0.6094752550125122, | |
| "rewards/cosine_scaled_reward": -0.11081065610051155, | |
| "rewards/format_reward": 0.9791666716337204, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 1035.9375305175781, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 3.7157654762268066, | |
| "kl": 0.62744140625, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0251, | |
| "reward": 0.49045146629214287, | |
| "reward_std": 0.4288835674524307, | |
| "rewards/cosine_scaled_reward": -0.21310760331107304, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 1336.2291870117188, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 1.2452759742736816, | |
| "kl": 0.7421875, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0296, | |
| "reward": 1.002578854560852, | |
| "reward_std": 0.5317458659410477, | |
| "rewards/cosine_scaled_reward": 0.05337274447083473, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 1363.2500305175781, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 1.7971941232681274, | |
| "kl": 0.7275390625, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0291, | |
| "reward": 1.0207700282335281, | |
| "reward_std": 0.8502290099859238, | |
| "rewards/cosine_scaled_reward": 0.09371834748890251, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 1141.7500305175781, | |
| "epoch": 0.568, | |
| "grad_norm": 2.350130796432495, | |
| "kl": 0.56689453125, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0227, | |
| "reward": 1.0808594226837158, | |
| "reward_std": 0.7858606725931168, | |
| "rewards/cosine_scaled_reward": 0.14459637086838484, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 1354.0625610351562, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 2.9282283782958984, | |
| "kl": 0.8212890625, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0329, | |
| "reward": 0.7300854474306107, | |
| "reward_std": 0.6824060678482056, | |
| "rewards/cosine_scaled_reward": -0.08287395909428596, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 1752.0625610351562, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 1.9427164793014526, | |
| "kl": 0.99267578125, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.0397, | |
| "reward": 0.6041746586561203, | |
| "reward_std": 0.5943188220262527, | |
| "rewards/cosine_scaled_reward": -0.12499601114541292, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 1389.7292175292969, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 3.8614306449890137, | |
| "kl": 0.90966796875, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0364, | |
| "reward": 0.8081704080104828, | |
| "reward_std": 0.7910114228725433, | |
| "rewards/cosine_scaled_reward": -0.022998109459877014, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.007592763372914685, | |
| "train_runtime": 38120.128, | |
| "train_samples_per_second": 0.63, | |
| "train_steps_per_second": 0.013 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |