{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5714285714285714, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 3001.9584350585938, "epoch": 0.001142857142857143, "grad_norm": 0.11473917961120605, "kl": 0.0, "learning_rate": 2e-08, "loss": 0.034, "reward": -0.010712452232837677, "reward_std": 0.48354096710681915, "rewards/cosine_scaled_reward": -0.1928562317043543, "rewards/format_reward": 0.37500000558793545, "step": 1 }, { "completion_length": 2822.541717529297, "epoch": 0.002285714285714286, "grad_norm": 0.17855221033096313, "kl": 0.0, "learning_rate": 4e-08, "loss": 0.1095, "reward": 0.4385625521535985, "reward_std": 0.8208381980657578, "rewards/cosine_scaled_reward": -0.009885392151772976, "rewards/format_reward": 0.4583333432674408, "step": 2 }, { "completion_length": 2903.604248046875, "epoch": 0.0034285714285714284, "grad_norm": 0.05400172621011734, "kl": 3.629922866821289e-05, "learning_rate": 6e-08, "loss": 0.0166, "reward": -0.3212598990648985, "reward_std": 0.36036985367536545, "rewards/cosine_scaled_reward": -0.3168799467384815, "rewards/format_reward": 0.31250000186264515, "step": 3 }, { "completion_length": 2924.8958740234375, "epoch": 0.004571428571428572, "grad_norm": 0.1298418492078781, "kl": 3.390759229660034e-05, "learning_rate": 8e-08, "loss": 0.0193, "reward": 0.11002232693135738, "reward_std": 0.5668230727314949, "rewards/cosine_scaled_reward": -0.12207217514514923, "rewards/format_reward": 0.3541666865348816, "step": 4 }, { "completion_length": 2699.4793090820312, "epoch": 0.005714285714285714, "grad_norm": 0.11395805329084396, "kl": 2.8192996978759766e-05, "learning_rate": 1e-07, "loss": 0.0509, "reward": 0.5249291565269232, "reward_std": 0.7597299069166183, "rewards/cosine_scaled_reward": 0.033297897316515446, "rewards/format_reward": 0.4583333544433117, "step": 5 }, { "completion_length": 2660.5001220703125, "epoch": 0.006857142857142857, "grad_norm": 0.15824902057647705, "kl": 4.559755325317383e-05, "learning_rate": 1.2e-07, "loss": 0.04, "reward": 0.42945386096835136, "reward_std": 0.6760371923446655, "rewards/cosine_scaled_reward": -0.05610641464591026, "rewards/format_reward": 0.541666679084301, "step": 6 }, { "completion_length": 2458.479217529297, "epoch": 0.008, "grad_norm": 0.10866966843605042, "kl": 2.4110078811645508e-05, "learning_rate": 1.4e-07, "loss": 0.0529, "reward": 0.7580276802182198, "reward_std": 0.6385035738348961, "rewards/cosine_scaled_reward": 0.09776384383440018, "rewards/format_reward": 0.5625000149011612, "step": 7 }, { "completion_length": 2977.8126220703125, "epoch": 0.009142857142857144, "grad_norm": 0.22230574488639832, "kl": 3.574788570404053e-05, "learning_rate": 1.6e-07, "loss": 0.0993, "reward": 0.06304685212671757, "reward_std": 0.8850619196891785, "rewards/cosine_scaled_reward": -0.16639323788695037, "rewards/format_reward": 0.3958333395421505, "step": 8 }, { "completion_length": 3034.5416870117188, "epoch": 0.010285714285714285, "grad_norm": 0.17408320307731628, "kl": 3.820657730102539e-05, "learning_rate": 1.8e-07, "loss": 0.0851, "reward": 0.06854809075593948, "reward_std": 0.8176102936267853, "rewards/cosine_scaled_reward": -0.10114264115691185, "rewards/format_reward": 0.2708333432674408, "step": 9 }, { "completion_length": 2121.2500610351562, "epoch": 0.011428571428571429, "grad_norm": 0.07089601457118988, "kl": 2.7008354663848877e-05, "learning_rate": 2e-07, "loss": 0.026, "reward": 0.6565612219274044, "reward_std": 0.6731352433562279, "rewards/cosine_scaled_reward": 0.026197269558906555, "rewards/format_reward": 0.6041666716337204, "step": 10 }, { "completion_length": 2388.166748046875, "epoch": 0.012571428571428572, "grad_norm": 0.17368823289871216, "kl": 2.911686897277832e-05, "learning_rate": 2.1999999999999998e-07, "loss": 0.09, "reward": 0.7517527863383293, "reward_std": 1.0614946484565735, "rewards/cosine_scaled_reward": 0.07379304803907871, "rewards/format_reward": 0.6041666865348816, "step": 11 }, { "completion_length": 2672.5834350585938, "epoch": 0.013714285714285714, "grad_norm": 0.09804865717887878, "kl": 3.5643577575683594e-05, "learning_rate": 2.4e-07, "loss": 0.0423, "reward": 0.46549332328140736, "reward_std": 0.59340400993824, "rewards/cosine_scaled_reward": -0.006836682558059692, "rewards/format_reward": 0.479166679084301, "step": 12 }, { "completion_length": 2250.187530517578, "epoch": 0.014857142857142857, "grad_norm": 0.10080444812774658, "kl": 3.0308961868286133e-05, "learning_rate": 2.6e-07, "loss": 0.0188, "reward": 0.6889139215054456, "reward_std": 0.8085261583328247, "rewards/cosine_scaled_reward": 0.06320697697810829, "rewards/format_reward": 0.5625000111758709, "step": 13 }, { "completion_length": 2936.9375610351562, "epoch": 0.016, "grad_norm": 0.1032668873667717, "kl": 4.1931867599487305e-05, "learning_rate": 2.8e-07, "loss": 0.0156, "reward": 0.10788557305932045, "reward_std": 0.6920560002326965, "rewards/cosine_scaled_reward": -0.11272389208897948, "rewards/format_reward": 0.3333333432674408, "step": 14 }, { "completion_length": 3221.666748046875, "epoch": 0.017142857142857144, "grad_norm": 0.10653272271156311, "kl": 3.7223100662231445e-05, "learning_rate": 3e-07, "loss": 0.0008, "reward": -0.2332199066877365, "reward_std": 0.63228340446949, "rewards/cosine_scaled_reward": -0.21035997135186335, "rewards/format_reward": 0.1875000111758709, "step": 15 }, { "completion_length": 2321.3750610351562, "epoch": 0.018285714285714287, "grad_norm": 0.14373674988746643, "kl": 2.193450927734375e-05, "learning_rate": 3.2e-07, "loss": 0.0532, "reward": 0.6621312350034714, "reward_std": 0.9647989273071289, "rewards/cosine_scaled_reward": 0.06023227237164974, "rewards/format_reward": 0.5416666865348816, "step": 16 }, { "completion_length": 3174.8333740234375, "epoch": 0.019428571428571427, "grad_norm": 0.07878188043832779, "kl": 3.62396240234375e-05, "learning_rate": 3.4000000000000003e-07, "loss": 0.0239, "reward": -0.20133600383996964, "reward_std": 0.5479727387428284, "rewards/cosine_scaled_reward": -0.2152513451874256, "rewards/format_reward": 0.2291666679084301, "step": 17 }, { "completion_length": 3214.229248046875, "epoch": 0.02057142857142857, "grad_norm": 0.1723223179578781, "kl": 5.7220458984375e-05, "learning_rate": 3.6e-07, "loss": 0.0648, "reward": -0.21091226488351822, "reward_std": 0.5157570615410805, "rewards/cosine_scaled_reward": -0.188789464533329, "rewards/format_reward": 0.1666666679084301, "step": 18 }, { "completion_length": 3238.9584350585938, "epoch": 0.021714285714285714, "grad_norm": 0.161203071475029, "kl": 2.1696090698242188e-05, "learning_rate": 3.7999999999999996e-07, "loss": 0.0562, "reward": 0.049652623711153865, "reward_std": 0.9271627813577652, "rewards/cosine_scaled_reward": -0.1210070364177227, "rewards/format_reward": 0.2916666679084301, "step": 19 }, { "completion_length": 2502.9584045410156, "epoch": 0.022857142857142857, "grad_norm": 0.19064471125602722, "kl": 3.2901763916015625e-05, "learning_rate": 4e-07, "loss": 0.097, "reward": 0.33966562896966934, "reward_std": 0.6814321130514145, "rewards/cosine_scaled_reward": -0.10100051760673523, "rewards/format_reward": 0.5416666865348816, "step": 20 }, { "completion_length": 2544.5833740234375, "epoch": 0.024, "grad_norm": 0.08170344680547714, "kl": 2.512335777282715e-05, "learning_rate": 4.1999999999999995e-07, "loss": 0.0099, "reward": 0.26008715480566025, "reward_std": 0.5456661060452461, "rewards/cosine_scaled_reward": -0.06787310540676117, "rewards/format_reward": 0.3958333432674408, "step": 21 }, { "completion_length": 3508.8126220703125, "epoch": 0.025142857142857144, "grad_norm": 0.14452184736728668, "kl": 2.7313828468322754e-05, "learning_rate": 4.3999999999999997e-07, "loss": 0.0283, "reward": -0.03812084347009659, "reward_std": 0.7810813337564468, "rewards/cosine_scaled_reward": -0.10239375196397305, "rewards/format_reward": 0.16666666977107525, "step": 22 }, { "completion_length": 3135.5000610351562, "epoch": 0.026285714285714287, "grad_norm": 0.18309734761714935, "kl": 4.690885543823242e-05, "learning_rate": 4.6e-07, "loss": 0.0864, "reward": -0.03534786030650139, "reward_std": 0.8103697001934052, "rewards/cosine_scaled_reward": -0.17392393667250872, "rewards/format_reward": 0.3125000111758709, "step": 23 }, { "completion_length": 2123.3750915527344, "epoch": 0.027428571428571427, "grad_norm": 0.07949961721897125, "kl": 1.4767050743103027e-05, "learning_rate": 4.8e-07, "loss": 0.0269, "reward": 0.6402075001969934, "reward_std": 0.7203418090939522, "rewards/cosine_scaled_reward": 0.018020419403910637, "rewards/format_reward": 0.6041666679084301, "step": 24 }, { "completion_length": 2792.7709045410156, "epoch": 0.02857142857142857, "grad_norm": 0.09897608309984207, "kl": 1.7628073692321777e-05, "learning_rate": 5e-07, "loss": 0.0263, "reward": 0.3667532876133919, "reward_std": 0.5270465165376663, "rewards/cosine_scaled_reward": -0.03537335619330406, "rewards/format_reward": 0.43750000558793545, "step": 25 }, { "completion_length": 3103.5416870117188, "epoch": 0.029714285714285714, "grad_norm": 0.15197034180164337, "kl": 1.8015503883361816e-05, "learning_rate": 5.2e-07, "loss": 0.0517, "reward": 0.23722141981124878, "reward_std": 0.826317235827446, "rewards/cosine_scaled_reward": -0.027222641743719578, "rewards/format_reward": 0.29166667722165585, "step": 26 }, { "completion_length": 3099.729248046875, "epoch": 0.030857142857142857, "grad_norm": 0.11937292665243149, "kl": 2.5153160095214844e-05, "learning_rate": 5.4e-07, "loss": 0.0279, "reward": -0.05506348796188831, "reward_std": 0.483004167675972, "rewards/cosine_scaled_reward": -0.14211508259177208, "rewards/format_reward": 0.2291666716337204, "step": 27 }, { "completion_length": 3221.7291870117188, "epoch": 0.032, "grad_norm": 0.1231866255402565, "kl": 2.6211142539978027e-05, "learning_rate": 5.6e-07, "loss": -0.0031, "reward": 0.19264543801546097, "reward_std": 0.7934563755989075, "rewards/cosine_scaled_reward": -0.07034394145011902, "rewards/format_reward": 0.33333334885537624, "step": 28 }, { "completion_length": 3130.6459350585938, "epoch": 0.03314285714285714, "grad_norm": 0.14249049127101898, "kl": 2.726912498474121e-06, "learning_rate": 5.8e-07, "loss": 0.0394, "reward": 0.20082764513790607, "reward_std": 1.0230832546949387, "rewards/cosine_scaled_reward": -0.06625284859910607, "rewards/format_reward": 0.33333334513008595, "step": 29 }, { "completion_length": 3211.125, "epoch": 0.03428571428571429, "grad_norm": 0.11244227737188339, "kl": 2.047419548034668e-05, "learning_rate": 6e-07, "loss": 0.0135, "reward": 0.11087529244832695, "reward_std": 0.6219374239444733, "rewards/cosine_scaled_reward": -0.09039569273591042, "rewards/format_reward": 0.29166667722165585, "step": 30 }, { "completion_length": 2505.687530517578, "epoch": 0.03542857142857143, "grad_norm": 0.10730752348899841, "kl": 2.9802322387695312e-05, "learning_rate": 6.2e-07, "loss": 0.0711, "reward": 0.10028511472046375, "reward_std": 0.7022345140576363, "rewards/cosine_scaled_reward": -0.1686074547469616, "rewards/format_reward": 0.4375000074505806, "step": 31 }, { "completion_length": 3546.5, "epoch": 0.036571428571428574, "grad_norm": 0.08949411660432816, "kl": 2.053380012512207e-05, "learning_rate": 6.4e-07, "loss": 0.0081, "reward": -0.4492787718772888, "reward_std": 0.4731578528881073, "rewards/cosine_scaled_reward": -0.2454727292060852, "rewards/format_reward": 0.0416666679084301, "step": 32 }, { "completion_length": 3140.4584350585938, "epoch": 0.037714285714285714, "grad_norm": 0.15533116459846497, "kl": 1.6998499631881714e-05, "learning_rate": 6.6e-07, "loss": 0.0936, "reward": 0.14784683287143707, "reward_std": 0.8761000260710716, "rewards/cosine_scaled_reward": -0.10315992683172226, "rewards/format_reward": 0.3541666716337204, "step": 33 }, { "completion_length": 3067.5208740234375, "epoch": 0.038857142857142854, "grad_norm": 0.05691331624984741, "kl": 7.178634405136108e-06, "learning_rate": 6.800000000000001e-07, "loss": 0.0164, "reward": -0.4488837197422981, "reward_std": 0.4332050681114197, "rewards/cosine_scaled_reward": -0.31819187104701996, "rewards/format_reward": 0.1875, "step": 34 }, { "completion_length": 2977.979248046875, "epoch": 0.04, "grad_norm": 0.13275845348834991, "kl": 2.034008502960205e-05, "learning_rate": 7e-07, "loss": 0.0431, "reward": 0.19347557425498962, "reward_std": 0.7837567403912544, "rewards/cosine_scaled_reward": -0.11159555055201054, "rewards/format_reward": 0.4166666828095913, "step": 35 }, { "completion_length": 2511.7500610351562, "epoch": 0.04114285714285714, "grad_norm": 0.08902338147163391, "kl": 7.106363773345947e-05, "learning_rate": 7.2e-07, "loss": 0.0143, "reward": 0.6313629895448685, "reward_std": 0.4862937852740288, "rewards/cosine_scaled_reward": 0.06568148266524076, "rewards/format_reward": 0.5000000055879354, "step": 36 }, { "completion_length": 2623.6458435058594, "epoch": 0.04228571428571429, "grad_norm": 0.12060169875621796, "kl": 6.20037317276001e-05, "learning_rate": 7.4e-07, "loss": 0.0323, "reward": 0.4485716000199318, "reward_std": 0.8753202259540558, "rewards/cosine_scaled_reward": -0.0361308753490448, "rewards/format_reward": 0.5208333414047956, "step": 37 }, { "completion_length": 3015.5625610351562, "epoch": 0.04342857142857143, "grad_norm": 0.10110022872686386, "kl": 0.00016170740127563477, "learning_rate": 7.599999999999999e-07, "loss": 0.0344, "reward": -0.068646389991045, "reward_std": 0.6391054093837738, "rewards/cosine_scaled_reward": -0.22182317543774843, "rewards/format_reward": 0.37500002048909664, "step": 38 }, { "completion_length": 2867.5208740234375, "epoch": 0.044571428571428574, "grad_norm": 0.15215592086315155, "kl": 0.00011932849884033203, "learning_rate": 7.799999999999999e-07, "loss": 0.1002, "reward": 0.14817129005677998, "reward_std": 0.7805476784706116, "rewards/cosine_scaled_reward": -0.12383103743195534, "rewards/format_reward": 0.39583333395421505, "step": 39 }, { "completion_length": 3186.5000610351562, "epoch": 0.045714285714285714, "grad_norm": 0.11930648982524872, "kl": 0.00010547041893005371, "learning_rate": 8e-07, "loss": 0.0178, "reward": -0.03248624689877033, "reward_std": 0.63504558801651, "rewards/cosine_scaled_reward": -0.16207645926624537, "rewards/format_reward": 0.29166666977107525, "step": 40 }, { "completion_length": 3180.8959350585938, "epoch": 0.046857142857142854, "grad_norm": 0.18630492687225342, "kl": 3.663450479507446e-05, "learning_rate": 8.199999999999999e-07, "loss": 0.0553, "reward": 0.43380990624427795, "reward_std": 0.8565632924437523, "rewards/cosine_scaled_reward": 0.018988274037837982, "rewards/format_reward": 0.3958333432674408, "step": 41 }, { "completion_length": 2079.1041870117188, "epoch": 0.048, "grad_norm": 0.11225883662700653, "kl": 0.0004626065492630005, "learning_rate": 8.399999999999999e-07, "loss": 0.0429, "reward": 0.8895847648382187, "reward_std": 0.764504998922348, "rewards/cosine_scaled_reward": 0.11145903076976538, "rewards/format_reward": 0.6666666828095913, "step": 42 }, { "completion_length": 3000.166748046875, "epoch": 0.04914285714285714, "grad_norm": 0.24759933352470398, "kl": 0.00012095272541046143, "learning_rate": 8.599999999999999e-07, "loss": 0.064, "reward": 0.32910796254873276, "reward_std": 1.0378518775105476, "rewards/cosine_scaled_reward": -0.03336267964914441, "rewards/format_reward": 0.39583333395421505, "step": 43 }, { "completion_length": 2956.9375610351562, "epoch": 0.05028571428571429, "grad_norm": 0.28840357065200806, "kl": 0.0008223056793212891, "learning_rate": 8.799999999999999e-07, "loss": 0.0814, "reward": 0.2166026197373867, "reward_std": 0.745319314301014, "rewards/cosine_scaled_reward": -0.10003203712403774, "rewards/format_reward": 0.4166666828095913, "step": 44 }, { "completion_length": 2793.9583740234375, "epoch": 0.05142857142857143, "grad_norm": 0.1415959894657135, "kl": 6.61015510559082e-05, "learning_rate": 9e-07, "loss": 0.0791, "reward": 0.6622170452028513, "reward_std": 0.8223324418067932, "rewards/cosine_scaled_reward": 0.08110851421952248, "rewards/format_reward": 0.5000000074505806, "step": 45 }, { "completion_length": 2979.9583740234375, "epoch": 0.052571428571428575, "grad_norm": 0.10514923185110092, "kl": 0.00029647350311279297, "learning_rate": 9.2e-07, "loss": 0.0303, "reward": 0.2565183639526367, "reward_std": 0.5196356028318405, "rewards/cosine_scaled_reward": -0.03840749338269234, "rewards/format_reward": 0.3333333432674408, "step": 46 }, { "completion_length": 2650.1458587646484, "epoch": 0.053714285714285714, "grad_norm": 0.15202954411506653, "kl": 0.0002989917993545532, "learning_rate": 9.399999999999999e-07, "loss": 0.0978, "reward": 0.6011475473642349, "reward_std": 0.908449612557888, "rewards/cosine_scaled_reward": 0.04015708714723587, "rewards/format_reward": 0.520833358168602, "step": 47 }, { "completion_length": 2887.9584350585938, "epoch": 0.054857142857142854, "grad_norm": 0.14365942776203156, "kl": 0.0003235340118408203, "learning_rate": 9.6e-07, "loss": 0.1098, "reward": 0.3464186545461416, "reward_std": 0.8909324407577515, "rewards/cosine_scaled_reward": -0.02470733504742384, "rewards/format_reward": 0.3958333395421505, "step": 48 }, { "completion_length": 2294.9792098999023, "epoch": 0.056, "grad_norm": 0.12595273554325104, "kl": 0.0003814399242401123, "learning_rate": 9.8e-07, "loss": 0.0294, "reward": 0.3887506239116192, "reward_std": 0.709479071199894, "rewards/cosine_scaled_reward": -0.08687468431890011, "rewards/format_reward": 0.5625000074505806, "step": 49 }, { "completion_length": 2483.5834350585938, "epoch": 0.05714285714285714, "grad_norm": 0.1347932517528534, "kl": 0.0020999908447265625, "learning_rate": 1e-06, "loss": 0.0097, "reward": 0.48975098691880703, "reward_std": 0.7372790724039078, "rewards/cosine_scaled_reward": 0.015708832070231438, "rewards/format_reward": 0.4583333432674408, "step": 50 }, { "completion_length": 3298.0208740234375, "epoch": 0.05828571428571429, "grad_norm": 0.12307793647050858, "kl": 0.0010235309600830078, "learning_rate": 9.999890338174275e-07, "loss": 0.0291, "reward": -0.01613167393952608, "reward_std": 0.7748741805553436, "rewards/cosine_scaled_reward": -0.1538991741836071, "rewards/format_reward": 0.2916666716337204, "step": 51 }, { "completion_length": 3464.9375610351562, "epoch": 0.05942857142857143, "grad_norm": 0.14545413851737976, "kl": 0.0018963813781738281, "learning_rate": 9.999561358041868e-07, "loss": 0.0448, "reward": -0.23706040158867836, "reward_std": 0.7933510839939117, "rewards/cosine_scaled_reward": -0.18103019893169403, "rewards/format_reward": 0.12500000186264515, "step": 52 }, { "completion_length": 2938.2084350585938, "epoch": 0.060571428571428575, "grad_norm": 0.08684064447879791, "kl": 0.0016429424285888672, "learning_rate": 9.999013075636804e-07, "loss": 0.0463, "reward": -0.056304458528757095, "reward_std": 0.5842409431934357, "rewards/cosine_scaled_reward": -0.18440223019570112, "rewards/format_reward": 0.3125000111758709, "step": 53 }, { "completion_length": 2789.2916870117188, "epoch": 0.061714285714285715, "grad_norm": 0.060190364718437195, "kl": 0.0017528533935546875, "learning_rate": 9.998245517681593e-07, "loss": 0.0171, "reward": -0.11311334511265159, "reward_std": 0.42039141058921814, "rewards/cosine_scaled_reward": -0.23364001512527466, "rewards/format_reward": 0.3541666716337204, "step": 54 }, { "completion_length": 3271.5625, "epoch": 0.06285714285714286, "grad_norm": 0.07129888236522675, "kl": 0.0009405612945556641, "learning_rate": 9.997258721585931e-07, "loss": 0.0159, "reward": -0.34992948174476624, "reward_std": 0.4250538572669029, "rewards/cosine_scaled_reward": -0.24788140505552292, "rewards/format_reward": 0.14583333395421505, "step": 55 }, { "completion_length": 3073.604248046875, "epoch": 0.064, "grad_norm": 0.16036204993724823, "kl": 0.0025844573974609375, "learning_rate": 9.996052735444862e-07, "loss": 0.0509, "reward": 0.015420392155647278, "reward_std": 0.7796643078327179, "rewards/cosine_scaled_reward": -0.11728980112820864, "rewards/format_reward": 0.25000000558793545, "step": 56 }, { "completion_length": 3082.9584350585938, "epoch": 0.06514285714285714, "grad_norm": 0.14083248376846313, "kl": 0.010837554931640625, "learning_rate": 9.994627618036452e-07, "loss": 0.0506, "reward": 0.042304279981181026, "reward_std": 0.7727529257535934, "rewards/cosine_scaled_reward": -0.13509786408394575, "rewards/format_reward": 0.3125000111758709, "step": 57 }, { "completion_length": 3073.3541870117188, "epoch": 0.06628571428571428, "grad_norm": 0.16678181290626526, "kl": 0.003218412399291992, "learning_rate": 9.992983438818915e-07, "loss": 0.0068, "reward": 0.20121465623378754, "reward_std": 0.7175656408071518, "rewards/cosine_scaled_reward": -0.055642676539719105, "rewards/format_reward": 0.3125000074505806, "step": 58 }, { "completion_length": 3008.2709350585938, "epoch": 0.06742857142857143, "grad_norm": 0.1475798785686493, "kl": 0.009433746337890625, "learning_rate": 9.991120277927223e-07, "loss": 0.0125, "reward": 0.4367425888776779, "reward_std": 0.647830456495285, "rewards/cosine_scaled_reward": 0.06212127208709717, "rewards/format_reward": 0.3125000111758709, "step": 59 }, { "completion_length": 2855.6666870117188, "epoch": 0.06857142857142857, "grad_norm": 0.09679862856864929, "kl": 0.00621795654296875, "learning_rate": 9.989038226169207e-07, "loss": 0.0164, "reward": 0.47872328013181686, "reward_std": 0.5911416038870811, "rewards/cosine_scaled_reward": 0.0622783238068223, "rewards/format_reward": 0.35416667722165585, "step": 60 }, { "completion_length": 2144.3750610351562, "epoch": 0.06971428571428571, "grad_norm": 0.05888332054018974, "kl": 0.0020198822021484375, "learning_rate": 9.98673738502114e-07, "loss": 0.0166, "reward": 1.0081715881824493, "reward_std": 0.5063923448324203, "rewards/cosine_scaled_reward": 0.19158576428890228, "rewards/format_reward": 0.625, "step": 61 }, { "completion_length": 3236.3125610351562, "epoch": 0.07085714285714285, "grad_norm": 0.16112229228019714, "kl": 0.0008752346038818359, "learning_rate": 9.98421786662277e-07, "loss": 0.0491, "reward": 0.47718358784914017, "reward_std": 0.9821799397468567, "rewards/cosine_scaled_reward": 0.01984177529811859, "rewards/format_reward": 0.4375000149011612, "step": 62 }, { "completion_length": 2407.416748046875, "epoch": 0.072, "grad_norm": 0.09190040081739426, "kl": 0.009485244750976562, "learning_rate": 9.981479793771866e-07, "loss": 0.0014, "reward": 0.6355759827420115, "reward_std": 0.5608287900686264, "rewards/cosine_scaled_reward": 0.026121314615011215, "rewards/format_reward": 0.5833333358168602, "step": 63 }, { "completion_length": 3042.2708740234375, "epoch": 0.07314285714285715, "grad_norm": 0.11311787366867065, "kl": 0.0009531974792480469, "learning_rate": 9.97852329991824e-07, "loss": 0.0158, "reward": 0.624295711517334, "reward_std": 0.6829620823264122, "rewards/cosine_scaled_reward": 0.1142311654984951, "rewards/format_reward": 0.3958333358168602, "step": 64 }, { "completion_length": 2979.9376220703125, "epoch": 0.07428571428571429, "grad_norm": 0.17287743091583252, "kl": 0.008108139038085938, "learning_rate": 9.975348529157229e-07, "loss": 0.0803, "reward": 0.3465092070400715, "reward_std": 0.8748672604560852, "rewards/cosine_scaled_reward": -0.01424538716673851, "rewards/format_reward": 0.37500000558793545, "step": 65 }, { "completion_length": 3180.5833740234375, "epoch": 0.07542857142857143, "grad_norm": 0.13114774227142334, "kl": 0.0013761520385742188, "learning_rate": 9.971955636222684e-07, "loss": 0.011, "reward": 0.197968615218997, "reward_std": 0.808275930583477, "rewards/cosine_scaled_reward": -0.07809901610016823, "rewards/format_reward": 0.3541666679084301, "step": 66 }, { "completion_length": 3045.2709350585938, "epoch": 0.07657142857142857, "grad_norm": 0.16203086078166962, "kl": 0.0018739700317382812, "learning_rate": 9.968344786479415e-07, "loss": 0.0847, "reward": 0.6482307966798544, "reward_std": 1.029038056731224, "rewards/cosine_scaled_reward": 0.10536541882902384, "rewards/format_reward": 0.43750001676380634, "step": 67 }, { "completion_length": 2498.9166870117188, "epoch": 0.07771428571428571, "grad_norm": 0.06138293072581291, "kl": 0.00598907470703125, "learning_rate": 9.964516155915151e-07, "loss": -0.0034, "reward": 0.12972787162289023, "reward_std": 0.5004179775714874, "rewards/cosine_scaled_reward": -0.12263606488704681, "rewards/format_reward": 0.375, "step": 68 }, { "completion_length": 2794.0834350585938, "epoch": 0.07885714285714286, "grad_norm": 0.1431104838848114, "kl": 0.005124092102050781, "learning_rate": 9.960469931131936e-07, "loss": 0.0283, "reward": 0.5150027610361576, "reward_std": 0.6274815611541271, "rewards/cosine_scaled_reward": 0.01791803538799286, "rewards/format_reward": 0.4791666716337204, "step": 69 }, { "completion_length": 3173.8125610351562, "epoch": 0.08, "grad_norm": 0.146661639213562, "kl": 0.0033349990844726562, "learning_rate": 9.956206309337066e-07, "loss": 0.0731, "reward": 0.23033593781292439, "reward_std": 0.7032231390476227, "rewards/cosine_scaled_reward": -0.04108203295618296, "rewards/format_reward": 0.31250000186264515, "step": 70 }, { "completion_length": 3088.0834350585938, "epoch": 0.08114285714285714, "grad_norm": 0.1698896586894989, "kl": 0.005756378173828125, "learning_rate": 9.951725498333448e-07, "loss": 0.0857, "reward": 0.4810620807111263, "reward_std": 0.7472349628806114, "rewards/cosine_scaled_reward": 0.032197702676057816, "rewards/format_reward": 0.4166666828095913, "step": 71 }, { "completion_length": 2835.9583740234375, "epoch": 0.08228571428571428, "grad_norm": 0.15748044848442078, "kl": 0.005644321441650391, "learning_rate": 9.947027716509488e-07, "loss": 0.0414, "reward": 0.39926697919145226, "reward_std": 0.7735992036759853, "rewards/cosine_scaled_reward": -0.029533179476857185, "rewards/format_reward": 0.4583333395421505, "step": 72 }, { "completion_length": 2668.854248046875, "epoch": 0.08342857142857144, "grad_norm": 0.2273511439561844, "kl": 0.0141448974609375, "learning_rate": 9.942113192828444e-07, "loss": 0.1186, "reward": 0.6719660833477974, "reward_std": 0.9455910921096802, "rewards/cosine_scaled_reward": 0.0859830379486084, "rewards/format_reward": 0.5000000298023224, "step": 73 }, { "completion_length": 2737.291748046875, "epoch": 0.08457142857142858, "grad_norm": 0.16039791703224182, "kl": 0.007320404052734375, "learning_rate": 9.93698216681727e-07, "loss": 0.0567, "reward": 0.31655584648251534, "reward_std": 0.6061973124742508, "rewards/cosine_scaled_reward": -0.03963874280452728, "rewards/format_reward": 0.3958333395421505, "step": 74 }, { "completion_length": 2990.854248046875, "epoch": 0.08571428571428572, "grad_norm": 0.22528968751430511, "kl": 0.007213592529296875, "learning_rate": 9.931634888554935e-07, "loss": 0.1029, "reward": 0.07040337100625038, "reward_std": 0.8260042667388916, "rewards/cosine_scaled_reward": -0.10021498240530491, "rewards/format_reward": 0.27083333767950535, "step": 75 }, { "completion_length": 2957.6459350585938, "epoch": 0.08685714285714285, "grad_norm": 0.12294893711805344, "kl": 0.0023813247680664062, "learning_rate": 9.926071618660237e-07, "loss": 0.0543, "reward": 0.28933531790971756, "reward_std": 0.7524442374706268, "rewards/cosine_scaled_reward": -0.04283232241868973, "rewards/format_reward": 0.3750000074505806, "step": 76 }, { "completion_length": 3163.9583740234375, "epoch": 0.088, "grad_norm": 0.09998784214258194, "kl": 0.003734588623046875, "learning_rate": 9.9202926282791e-07, "loss": 0.0151, "reward": 0.43463192135095596, "reward_std": 0.6034069135785103, "rewards/cosine_scaled_reward": 0.0506493030115962, "rewards/format_reward": 0.3333333432674408, "step": 77 }, { "completion_length": 2789.729217529297, "epoch": 0.08914285714285715, "grad_norm": 0.1028476133942604, "kl": 0.0034427642822265625, "learning_rate": 9.91429819907136e-07, "loss": 0.009, "reward": 0.49053217470645905, "reward_std": 0.671901747584343, "rewards/cosine_scaled_reward": 0.005682730115950108, "rewards/format_reward": 0.4791666716337204, "step": 78 }, { "completion_length": 3021.9584350585938, "epoch": 0.09028571428571429, "grad_norm": 0.14524304866790771, "kl": 0.002349853515625, "learning_rate": 9.908088623197048e-07, "loss": 0.0437, "reward": 0.18685297295451164, "reward_std": 0.82758379727602, "rewards/cosine_scaled_reward": -0.10449018701910973, "rewards/format_reward": 0.3958333469927311, "step": 79 }, { "completion_length": 3422.2916870117188, "epoch": 0.09142857142857143, "grad_norm": 0.35899317264556885, "kl": 0.0026226043701171875, "learning_rate": 9.901664203302124e-07, "loss": 0.0522, "reward": -0.14088810980319977, "reward_std": 0.6001620069146156, "rewards/cosine_scaled_reward": -0.14336072688456625, "rewards/format_reward": 0.1458333358168602, "step": 80 }, { "completion_length": 3328.1458740234375, "epoch": 0.09257142857142857, "grad_norm": 0.1413203924894333, "kl": 0.003086090087890625, "learning_rate": 9.895025252503755e-07, "loss": -0.0009, "reward": 0.20990341156721115, "reward_std": 0.7368708997964859, "rewards/cosine_scaled_reward": -0.04088162397965789, "rewards/format_reward": 0.29166668094694614, "step": 81 }, { "completion_length": 3138.6041870117188, "epoch": 0.09371428571428571, "grad_norm": 0.5630224943161011, "kl": 0.0060558319091796875, "learning_rate": 9.888172094375033e-07, "loss": 0.0801, "reward": 0.005654335021972656, "reward_std": 0.7520733773708344, "rewards/cosine_scaled_reward": -0.14300616830587387, "rewards/format_reward": 0.2916666716337204, "step": 82 }, { "completion_length": 3398.0833740234375, "epoch": 0.09485714285714286, "grad_norm": 0.09970960766077042, "kl": 0.0034198760986328125, "learning_rate": 9.881105062929221e-07, "loss": 0.0172, "reward": -0.2690254710614681, "reward_std": 0.6017113700509071, "rewards/cosine_scaled_reward": -0.2490960769355297, "rewards/format_reward": 0.22916667722165585, "step": 83 }, { "completion_length": 2666.2501220703125, "epoch": 0.096, "grad_norm": 0.2184879034757614, "kl": 0.002471923828125, "learning_rate": 9.873824502603459e-07, "loss": 0.0858, "reward": 1.2698333784937859, "reward_std": 1.1699798554182053, "rewards/cosine_scaled_reward": 0.3119999971240759, "rewards/format_reward": 0.6458333656191826, "step": 84 }, { "completion_length": 3092.9791870117188, "epoch": 0.09714285714285714, "grad_norm": 0.11792045831680298, "kl": 0.0024585723876953125, "learning_rate": 9.866330768241983e-07, "loss": 0.0172, "reward": 0.3196272477507591, "reward_std": 0.7417704239487648, "rewards/cosine_scaled_reward": -0.017269723117351532, "rewards/format_reward": 0.354166679084301, "step": 85 }, { "completion_length": 3099.604248046875, "epoch": 0.09828571428571428, "grad_norm": 0.12413817644119263, "kl": 0.004852294921875, "learning_rate": 9.85862422507884e-07, "loss": 0.0517, "reward": -0.07946242019534111, "reward_std": 0.5531802475452423, "rewards/cosine_scaled_reward": -0.1751478873193264, "rewards/format_reward": 0.2708333432674408, "step": 86 }, { "completion_length": 3024.354278564453, "epoch": 0.09942857142857142, "grad_norm": 0.12308648228645325, "kl": 0.006999969482421875, "learning_rate": 9.850705248720068e-07, "loss": 0.0426, "reward": 0.1297205686569214, "reward_std": 0.7171878144145012, "rewards/cosine_scaled_reward": -0.12263973196968436, "rewards/format_reward": 0.37500000186264515, "step": 87 }, { "completion_length": 2875.6875610351562, "epoch": 0.10057142857142858, "grad_norm": 0.1610432118177414, "kl": 0.014064788818359375, "learning_rate": 9.8425742251254e-07, "loss": 0.0297, "reward": 0.6831055271031801, "reward_std": 0.7087237983942032, "rewards/cosine_scaled_reward": 0.0811360776424408, "rewards/format_reward": 0.5208333432674408, "step": 88 }, { "completion_length": 3291.3959350585938, "epoch": 0.10171428571428572, "grad_norm": 0.14732913672924042, "kl": 0.004520416259765625, "learning_rate": 9.83423155058946e-07, "loss": 0.063, "reward": 0.3873383179306984, "reward_std": 0.9104212373495102, "rewards/cosine_scaled_reward": 0.0374191589653492, "rewards/format_reward": 0.3125000111758709, "step": 89 }, { "completion_length": 3100.7500610351562, "epoch": 0.10285714285714286, "grad_norm": 0.09902340173721313, "kl": 0.005191802978515625, "learning_rate": 9.825677631722435e-07, "loss": 0.0212, "reward": 0.2355214934796095, "reward_std": 0.5521544776856899, "rewards/cosine_scaled_reward": -0.03848925232887268, "rewards/format_reward": 0.3125000074505806, "step": 90 }, { "completion_length": 3321.3958740234375, "epoch": 0.104, "grad_norm": 0.11201111227273941, "kl": 0.0046215057373046875, "learning_rate": 9.816912885430258e-07, "loss": 0.0302, "reward": 0.06314115412533283, "reward_std": 0.6101053357124329, "rewards/cosine_scaled_reward": -0.1246794331818819, "rewards/format_reward": 0.31250000558793545, "step": 91 }, { "completion_length": 2866.9375610351562, "epoch": 0.10514285714285715, "grad_norm": 0.08195216953754425, "kl": 0.00637054443359375, "learning_rate": 9.807937738894303e-07, "loss": 0.0374, "reward": 0.2856922000646591, "reward_std": 0.6180723085999489, "rewards/cosine_scaled_reward": -0.09673722740262747, "rewards/format_reward": 0.4791666716337204, "step": 92 }, { "completion_length": 2626.8333740234375, "epoch": 0.10628571428571429, "grad_norm": 0.0848076120018959, "kl": 0.00502777099609375, "learning_rate": 9.798752629550546e-07, "loss": 0.0158, "reward": 0.47025431878864765, "reward_std": 0.5611053630709648, "rewards/cosine_scaled_reward": 0.005960509181022644, "rewards/format_reward": 0.4583333432674408, "step": 93 }, { "completion_length": 3384.666748046875, "epoch": 0.10742857142857143, "grad_norm": 0.11509731411933899, "kl": 0.005451202392578125, "learning_rate": 9.78935800506826e-07, "loss": 0.0334, "reward": 0.010346372611820698, "reward_std": 0.6185438930988312, "rewards/cosine_scaled_reward": -0.09899348951876163, "rewards/format_reward": 0.2083333358168602, "step": 94 }, { "completion_length": 3308.729248046875, "epoch": 0.10857142857142857, "grad_norm": 0.13493004441261292, "kl": 0.00511932373046875, "learning_rate": 9.779754323328192e-07, "loss": 0.0494, "reward": -0.04175245389342308, "reward_std": 0.819076806306839, "rewards/cosine_scaled_reward": -0.14587622694671154, "rewards/format_reward": 0.2500000111758709, "step": 95 }, { "completion_length": 2638.8333740234375, "epoch": 0.10971428571428571, "grad_norm": 0.1093597412109375, "kl": 0.006412506103515625, "learning_rate": 9.769942052400235e-07, "loss": 0.0613, "reward": 0.2132774479687214, "reward_std": 0.6241517812013626, "rewards/cosine_scaled_reward": -0.1121112871915102, "rewards/format_reward": 0.4375000149011612, "step": 96 }, { "completion_length": 3025.687530517578, "epoch": 0.11085714285714286, "grad_norm": 0.14619475603103638, "kl": 0.017696380615234375, "learning_rate": 9.759921670520634e-07, "loss": 0.0518, "reward": 0.21731913276016712, "reward_std": 0.8663276582956314, "rewards/cosine_scaled_reward": -0.058007098734378815, "rewards/format_reward": 0.33333334513008595, "step": 97 }, { "completion_length": 2998.0833740234375, "epoch": 0.112, "grad_norm": 0.08425849676132202, "kl": 0.011322021484375, "learning_rate": 9.749693666068663e-07, "loss": 0.0316, "reward": -0.0604003369808197, "reward_std": 0.4831971898674965, "rewards/cosine_scaled_reward": -0.17603351920843124, "rewards/format_reward": 0.2916666716337204, "step": 98 }, { "completion_length": 2882.8958435058594, "epoch": 0.11314285714285714, "grad_norm": 0.10733813792467117, "kl": 0.0042572021484375, "learning_rate": 9.739258537542835e-07, "loss": 0.0469, "reward": 0.37905219942331314, "reward_std": 0.6325190886855125, "rewards/cosine_scaled_reward": 0.012442763894796371, "rewards/format_reward": 0.35416667722165585, "step": 99 }, { "completion_length": 3077.979248046875, "epoch": 0.11428571428571428, "grad_norm": 0.22007572650909424, "kl": 0.00611114501953125, "learning_rate": 9.728616793536587e-07, "loss": 0.0684, "reward": 0.15971739403903484, "reward_std": 0.8245379701256752, "rewards/cosine_scaled_reward": -0.0972246453166008, "rewards/format_reward": 0.3541666716337204, "step": 100 }, { "completion_length": 3149.5000610351562, "epoch": 0.11542857142857142, "grad_norm": 0.17998212575912476, "kl": 0.0086212158203125, "learning_rate": 9.717768952713511e-07, "loss": 0.0924, "reward": -0.043516192585229874, "reward_std": 0.7394061759114265, "rewards/cosine_scaled_reward": -0.1467580944299698, "rewards/format_reward": 0.25000001303851604, "step": 101 }, { "completion_length": 2830.2500610351562, "epoch": 0.11657142857142858, "grad_norm": 0.10636850446462631, "kl": 0.006778717041015625, "learning_rate": 9.706715543782064e-07, "loss": 0.0118, "reward": 0.20193170942366123, "reward_std": 0.5816469639539719, "rewards/cosine_scaled_reward": -0.09695081505924463, "rewards/format_reward": 0.3958333358168602, "step": 102 }, { "completion_length": 3253.354248046875, "epoch": 0.11771428571428572, "grad_norm": 0.10601601004600525, "kl": 0.0059051513671875, "learning_rate": 9.695457105469804e-07, "loss": 0.0393, "reward": 0.16332483664155006, "reward_std": 0.7165435254573822, "rewards/cosine_scaled_reward": -0.07458756864070892, "rewards/format_reward": 0.3125000149011612, "step": 103 }, { "completion_length": 2784.0416870117188, "epoch": 0.11885714285714286, "grad_norm": 0.14525532722473145, "kl": 0.00762176513671875, "learning_rate": 9.683994186497132e-07, "loss": 0.0257, "reward": 0.6941813006997108, "reward_std": 0.731097511947155, "rewards/cosine_scaled_reward": 0.13875730894505978, "rewards/format_reward": 0.41666667722165585, "step": 104 }, { "completion_length": 3037.291748046875, "epoch": 0.12, "grad_norm": 0.10406464338302612, "kl": 0.0091552734375, "learning_rate": 9.672327345550543e-07, "loss": 0.0217, "reward": -0.03945709019899368, "reward_std": 0.5527790486812592, "rewards/cosine_scaled_reward": -0.14472855255007744, "rewards/format_reward": 0.25000000186264515, "step": 105 }, { "completion_length": 3007.6250610351562, "epoch": 0.12114285714285715, "grad_norm": 0.1392635703086853, "kl": 0.00736236572265625, "learning_rate": 9.66045715125541e-07, "loss": 0.0638, "reward": 0.2589884400367737, "reward_std": 0.8927985578775406, "rewards/cosine_scaled_reward": -0.05800577998161316, "rewards/format_reward": 0.3750000111758709, "step": 106 }, { "completion_length": 2705.52099609375, "epoch": 0.12228571428571429, "grad_norm": 0.19877693057060242, "kl": 0.00640869140625, "learning_rate": 9.648384182148252e-07, "loss": 0.0852, "reward": 0.42868572287261486, "reward_std": 0.7907231077551842, "rewards/cosine_scaled_reward": -0.025240465998649597, "rewards/format_reward": 0.4791666716337204, "step": 107 }, { "completion_length": 2601.9793090820312, "epoch": 0.12342857142857143, "grad_norm": 0.1907849907875061, "kl": 0.010498046875, "learning_rate": 9.636109026648554e-07, "loss": 0.0862, "reward": 1.0781057141721249, "reward_std": 0.926390677690506, "rewards/cosine_scaled_reward": 0.2578028216958046, "rewards/format_reward": 0.5625, "step": 108 }, { "completion_length": 2873.500030517578, "epoch": 0.12457142857142857, "grad_norm": 0.12728413939476013, "kl": 0.00748443603515625, "learning_rate": 9.623632283030077e-07, "loss": 0.0443, "reward": 0.2420949712395668, "reward_std": 0.6641058176755905, "rewards/cosine_scaled_reward": -0.0872858352959156, "rewards/format_reward": 0.416666679084301, "step": 109 }, { "completion_length": 2759.041748046875, "epoch": 0.12571428571428572, "grad_norm": 0.3926822543144226, "kl": 0.0103759765625, "learning_rate": 9.610954559391704e-07, "loss": 0.051, "reward": 0.7485219649970531, "reward_std": 1.0151629000902176, "rewards/cosine_scaled_reward": 0.07217762316577137, "rewards/format_reward": 0.6041666865348816, "step": 110 }, { "completion_length": 1981.291748046875, "epoch": 0.12685714285714286, "grad_norm": 0.13100939989089966, "kl": 0.0102081298828125, "learning_rate": 9.598076473627796e-07, "loss": -0.018, "reward": 0.8073812872171402, "reward_std": 0.8186813145875931, "rewards/cosine_scaled_reward": 0.028690634877420962, "rewards/format_reward": 0.7500000149011612, "step": 111 }, { "completion_length": 3106.9583740234375, "epoch": 0.128, "grad_norm": 0.18594208359718323, "kl": 0.01175689697265625, "learning_rate": 9.58499865339809e-07, "loss": 0.0913, "reward": 0.3422376364469528, "reward_std": 0.8253115490078926, "rewards/cosine_scaled_reward": -0.016381196677684784, "rewards/format_reward": 0.3750000223517418, "step": 112 }, { "completion_length": 2985.0208740234375, "epoch": 0.12914285714285714, "grad_norm": 0.10086725652217865, "kl": 0.0164794921875, "learning_rate": 9.571721736097088e-07, "loss": 0.026, "reward": 0.6304376311600208, "reward_std": 0.6578450873494148, "rewards/cosine_scaled_reward": 0.10688545554876328, "rewards/format_reward": 0.41666667722165585, "step": 113 }, { "completion_length": 2085.0416870117188, "epoch": 0.13028571428571428, "grad_norm": 0.1902545839548111, "kl": 0.0139923095703125, "learning_rate": 9.55824636882301e-07, "loss": 0.0715, "reward": 0.9513098001480103, "reward_std": 0.9133107215166092, "rewards/cosine_scaled_reward": 0.13190488796681166, "rewards/format_reward": 0.6875000149011612, "step": 114 }, { "completion_length": 3096.6250610351562, "epoch": 0.13142857142857142, "grad_norm": 0.1532527357339859, "kl": 0.01084136962890625, "learning_rate": 9.54457320834625e-07, "loss": 0.0594, "reward": 0.5540619897656143, "reward_std": 0.9744190573692322, "rewards/cosine_scaled_reward": 0.05828099511563778, "rewards/format_reward": 0.43750000558793545, "step": 115 }, { "completion_length": 3017.5833740234375, "epoch": 0.13257142857142856, "grad_norm": 0.1256159394979477, "kl": 0.012542724609375, "learning_rate": 9.530702921077358e-07, "loss": 0.0515, "reward": -0.0034197866916656494, "reward_std": 0.6141533181071281, "rewards/cosine_scaled_reward": -0.12670988403260708, "rewards/format_reward": 0.25, "step": 116 }, { "completion_length": 3360.916748046875, "epoch": 0.1337142857142857, "grad_norm": 0.13030223548412323, "kl": 0.010650634765625, "learning_rate": 9.516636183034564e-07, "loss": 0.0416, "reward": -0.01844558771699667, "reward_std": 0.7771024033427238, "rewards/cosine_scaled_reward": -0.13422280363738537, "rewards/format_reward": 0.2500000111758709, "step": 117 }, { "completion_length": 2493.2709350585938, "epoch": 0.13485714285714287, "grad_norm": 0.10445129871368408, "kl": 0.01708984375, "learning_rate": 9.502373679810839e-07, "loss": 0.045, "reward": 0.7345311008393764, "reward_std": 0.6608476266264915, "rewards/cosine_scaled_reward": 0.054765526205301285, "rewards/format_reward": 0.6250000055879354, "step": 118 }, { "completion_length": 2583.8334045410156, "epoch": 0.136, "grad_norm": 0.1751917004585266, "kl": 0.01385498046875, "learning_rate": 9.487916106540465e-07, "loss": 0.1151, "reward": 0.16756585985422134, "reward_std": 0.6609668508172035, "rewards/cosine_scaled_reward": -0.1558004072867334, "rewards/format_reward": 0.4791666939854622, "step": 119 }, { "completion_length": 3435.6458740234375, "epoch": 0.13714285714285715, "grad_norm": 0.20698896050453186, "kl": 0.01154327392578125, "learning_rate": 9.473264167865171e-07, "loss": 0.0604, "reward": -0.05263599753379822, "reward_std": 1.0508478283882141, "rewards/cosine_scaled_reward": -0.17215134110301733, "rewards/format_reward": 0.2916666716337204, "step": 120 }, { "completion_length": 3101.875, "epoch": 0.1382857142857143, "grad_norm": 0.10516638308763504, "kl": 0.012359619140625, "learning_rate": 9.458418577899774e-07, "loss": 0.0121, "reward": 0.045499179512262344, "reward_std": 0.5043403655290604, "rewards/cosine_scaled_reward": -0.10225043445825577, "rewards/format_reward": 0.2500000111758709, "step": 121 }, { "completion_length": 3044.3541870117188, "epoch": 0.13942857142857143, "grad_norm": 0.10074342042207718, "kl": 0.019744873046875, "learning_rate": 9.443380060197385e-07, "loss": 0.0442, "reward": 0.021381250582635403, "reward_std": 0.5577950775623322, "rewards/cosine_scaled_reward": -0.13514270819723606, "rewards/format_reward": 0.2916666716337204, "step": 122 }, { "completion_length": 2913.7083740234375, "epoch": 0.14057142857142857, "grad_norm": 0.14308768510818481, "kl": 0.0152587890625, "learning_rate": 9.428149347714143e-07, "loss": 0.0878, "reward": 0.12965750694274902, "reward_std": 0.736047625541687, "rewards/cosine_scaled_reward": -0.09142125025391579, "rewards/format_reward": 0.31250000558793545, "step": 123 }, { "completion_length": 2562.5000610351562, "epoch": 0.1417142857142857, "grad_norm": 0.19142040610313416, "kl": 0.01031494140625, "learning_rate": 9.412727182773486e-07, "loss": 0.065, "reward": 0.8353077471256256, "reward_std": 1.026055485010147, "rewards/cosine_scaled_reward": 0.12598720658570528, "rewards/format_reward": 0.5833333507180214, "step": 124 }, { "completion_length": 3017.6251220703125, "epoch": 0.14285714285714285, "grad_norm": 0.117274209856987, "kl": 0.009552001953125, "learning_rate": 9.397114317029974e-07, "loss": 0.0076, "reward": 0.1632972015067935, "reward_std": 0.5557524636387825, "rewards/cosine_scaled_reward": -0.10585140064358711, "rewards/format_reward": 0.37500000558793545, "step": 125 }, { "completion_length": 2858.8334350585938, "epoch": 0.144, "grad_norm": 0.2655041217803955, "kl": 0.01821136474609375, "learning_rate": 9.381311511432658e-07, "loss": 0.0873, "reward": 0.30082017183303833, "reward_std": 0.9569597989320755, "rewards/cosine_scaled_reward": -0.06833992386236787, "rewards/format_reward": 0.4375000074505806, "step": 126 }, { "completion_length": 2871.2083435058594, "epoch": 0.14514285714285713, "grad_norm": 0.0872960090637207, "kl": 0.0139007568359375, "learning_rate": 9.36531953618799e-07, "loss": -0.0046, "reward": 0.1562192291021347, "reward_std": 0.58997593075037, "rewards/cosine_scaled_reward": -0.10939039289951324, "rewards/format_reward": 0.37500000558793545, "step": 127 }, { "completion_length": 3199.2291870117188, "epoch": 0.1462857142857143, "grad_norm": 0.21217796206474304, "kl": 0.020172119140625, "learning_rate": 9.34913917072228e-07, "loss": 0.0995, "reward": 0.07891843095421791, "reward_std": 0.858635775744915, "rewards/cosine_scaled_reward": -0.10637411894276738, "rewards/format_reward": 0.2916666716337204, "step": 128 }, { "completion_length": 2658.1458435058594, "epoch": 0.14742857142857144, "grad_norm": 0.13081493973731995, "kl": 0.0191650390625, "learning_rate": 9.332771203643714e-07, "loss": 0.0418, "reward": 0.2748406231403351, "reward_std": 0.6719504073262215, "rewards/cosine_scaled_reward": -0.10216302564367652, "rewards/format_reward": 0.4791666753590107, "step": 129 }, { "completion_length": 3460.2291870117188, "epoch": 0.14857142857142858, "grad_norm": 0.12681667506694794, "kl": 0.01409912109375, "learning_rate": 9.316216432703916e-07, "loss": 0.0399, "reward": -0.11496437340974808, "reward_std": 0.6864899545907974, "rewards/cosine_scaled_reward": -0.15123217983637005, "rewards/format_reward": 0.18750000558793545, "step": 130 }, { "completion_length": 3362.3750610351562, "epoch": 0.14971428571428572, "grad_norm": 0.12439722567796707, "kl": 0.01568603515625, "learning_rate": 9.299475664759068e-07, "loss": 0.0316, "reward": 0.17270515114068985, "reward_std": 0.636282742023468, "rewards/cosine_scaled_reward": -0.01781410351395607, "rewards/format_reward": 0.20833334513008595, "step": 131 }, { "completion_length": 3433.3333740234375, "epoch": 0.15085714285714286, "grad_norm": 0.13320712745189667, "kl": 0.020172119140625, "learning_rate": 9.282549715730579e-07, "loss": 0.0099, "reward": -0.2513204962015152, "reward_std": 0.6501054912805557, "rewards/cosine_scaled_reward": -0.2298269160091877, "rewards/format_reward": 0.2083333432674408, "step": 132 }, { "completion_length": 3220.1459350585938, "epoch": 0.152, "grad_norm": 0.17302778363227844, "kl": 0.01995849609375, "learning_rate": 9.265439410565328e-07, "loss": 0.0398, "reward": -0.11768799647688866, "reward_std": 0.6951716169714928, "rewards/cosine_scaled_reward": -0.22551067918539047, "rewards/format_reward": 0.3333333544433117, "step": 133 }, { "completion_length": 2314.5416870117188, "epoch": 0.15314285714285714, "grad_norm": 0.0858488380908966, "kl": 0.025665283203125, "learning_rate": 9.248145583195447e-07, "loss": 0.0092, "reward": 0.602238692343235, "reward_std": 0.563841238617897, "rewards/cosine_scaled_reward": -0.011380670592188835, "rewards/format_reward": 0.625, "step": 134 }, { "completion_length": 2965.8750610351562, "epoch": 0.15428571428571428, "grad_norm": 0.17062057554721832, "kl": 0.019134521484375, "learning_rate": 9.230669076497687e-07, "loss": 0.045, "reward": 0.17675711959600449, "reward_std": 0.5801602862775326, "rewards/cosine_scaled_reward": -0.05745477043092251, "rewards/format_reward": 0.2916666679084301, "step": 135 }, { "completion_length": 2485.8334045410156, "epoch": 0.15542857142857142, "grad_norm": 0.13649305701255798, "kl": 0.021697998046875, "learning_rate": 9.213010742252327e-07, "loss": 0.0181, "reward": 0.6858363393694162, "reward_std": 0.8353622853755951, "rewards/cosine_scaled_reward": 0.009584830142557621, "rewards/format_reward": 0.6666666865348816, "step": 136 }, { "completion_length": 3372.6043090820312, "epoch": 0.15657142857142858, "grad_norm": 0.12744168937206268, "kl": 0.0316314697265625, "learning_rate": 9.195171441101668e-07, "loss": 0.0593, "reward": -0.08887681737542152, "reward_std": 0.6366704031825066, "rewards/cosine_scaled_reward": -0.1486050896346569, "rewards/format_reward": 0.2083333358168602, "step": 137 }, { "completion_length": 2583.1250915527344, "epoch": 0.15771428571428572, "grad_norm": 0.1180926188826561, "kl": 0.0181732177734375, "learning_rate": 9.177152042508077e-07, "loss": 0.035, "reward": 0.8022582903504372, "reward_std": 0.7210212647914886, "rewards/cosine_scaled_reward": 0.11987911909818649, "rewards/format_reward": 0.5625000111758709, "step": 138 }, { "completion_length": 2719.5208435058594, "epoch": 0.15885714285714286, "grad_norm": 0.13920994102954865, "kl": 0.0205841064453125, "learning_rate": 9.158953424711624e-07, "loss": 0.0124, "reward": 0.43635744601488113, "reward_std": 0.7499766424298286, "rewards/cosine_scaled_reward": -0.042237947694957256, "rewards/format_reward": 0.520833345130086, "step": 139 }, { "completion_length": 2655.6251220703125, "epoch": 0.16, "grad_norm": 0.12660294771194458, "kl": 0.01995849609375, "learning_rate": 9.140576474687263e-07, "loss": 0.0166, "reward": 0.6822620648890734, "reward_std": 0.6412546709179878, "rewards/cosine_scaled_reward": 0.049464356154203415, "rewards/format_reward": 0.5833333432674408, "step": 140 }, { "completion_length": 2931.5208740234375, "epoch": 0.16114285714285714, "grad_norm": 0.21838468313217163, "kl": 0.023284912109375, "learning_rate": 9.122022088101613e-07, "loss": 0.0897, "reward": 0.5501389801502228, "reward_std": 0.931708961725235, "rewards/cosine_scaled_reward": 0.05631948262453079, "rewards/format_reward": 0.4375000074505806, "step": 141 }, { "completion_length": 2731.5209350585938, "epoch": 0.16228571428571428, "grad_norm": 0.1206783875823021, "kl": 0.0212860107421875, "learning_rate": 9.103291169269299e-07, "loss": 0.0285, "reward": 0.44770222902297974, "reward_std": 0.6320216841995716, "rewards/cosine_scaled_reward": 0.01551777683198452, "rewards/format_reward": 0.41666667722165585, "step": 142 }, { "completion_length": 3003.0833740234375, "epoch": 0.16342857142857142, "grad_norm": 0.1385820508003235, "kl": 0.0207366943359375, "learning_rate": 9.084384631108882e-07, "loss": 0.0375, "reward": 0.36010952293872833, "reward_std": 0.6810671910643578, "rewards/cosine_scaled_reward": -0.007445234805345535, "rewards/format_reward": 0.37500001676380634, "step": 143 }, { "completion_length": 3069.2084350585938, "epoch": 0.16457142857142856, "grad_norm": 0.24886491894721985, "kl": 0.023193359375, "learning_rate": 9.065303395098358e-07, "loss": 0.0784, "reward": 0.5813055820763111, "reward_std": 1.03695610165596, "rewards/cosine_scaled_reward": 0.030236128717660904, "rewards/format_reward": 0.520833358168602, "step": 144 }, { "completion_length": 3248.541748046875, "epoch": 0.1657142857142857, "grad_norm": 0.27944961190223694, "kl": 0.026947021484375, "learning_rate": 9.046048391230247e-07, "loss": 0.0821, "reward": 0.31095648277550936, "reward_std": 1.044460952281952, "rewards/cosine_scaled_reward": -0.0007717590779066086, "rewards/format_reward": 0.3125000037252903, "step": 145 }, { "completion_length": 3391.2918090820312, "epoch": 0.16685714285714287, "grad_norm": 0.1663837432861328, "kl": 0.033966064453125, "learning_rate": 9.026620557966279e-07, "loss": 0.0146, "reward": 0.22623740322887897, "reward_std": 0.7760383784770966, "rewards/cosine_scaled_reward": -0.10563132539391518, "rewards/format_reward": 0.4375000111758709, "step": 146 }, { "completion_length": 2764.3750610351562, "epoch": 0.168, "grad_norm": 0.15888190269470215, "kl": 0.0340576171875, "learning_rate": 9.007020842191634e-07, "loss": 0.02, "reward": 0.43453994020819664, "reward_std": 0.6980537474155426, "rewards/cosine_scaled_reward": -0.011896707117557526, "rewards/format_reward": 0.4583333395421505, "step": 147 }, { "completion_length": 3387.8125610351562, "epoch": 0.16914285714285715, "grad_norm": 0.1351691633462906, "kl": 0.0419921875, "learning_rate": 8.987250199168808e-07, "loss": 0.0369, "reward": -0.3159765365999192, "reward_std": 0.5913000628352165, "rewards/cosine_scaled_reward": -0.2517382688820362, "rewards/format_reward": 0.18750000558793545, "step": 148 }, { "completion_length": 3070.1458740234375, "epoch": 0.1702857142857143, "grad_norm": 0.13587744534015656, "kl": 0.0269927978515625, "learning_rate": 8.967309592491052e-07, "loss": 0.0154, "reward": 0.8058477342128754, "reward_std": 0.6222796887159348, "rewards/cosine_scaled_reward": 0.17375719547271729, "rewards/format_reward": 0.4583333432674408, "step": 149 }, { "completion_length": 2798.2084350585938, "epoch": 0.17142857142857143, "grad_norm": 0.15157835185527802, "kl": 0.03900146484375, "learning_rate": 8.9471999940354e-07, "loss": 0.038, "reward": 0.05908125883433968, "reward_std": 0.7354179471731186, "rewards/cosine_scaled_reward": -0.21004271879792213, "rewards/format_reward": 0.4791666679084301, "step": 150 }, { "completion_length": 2979.8541870117188, "epoch": 0.17257142857142857, "grad_norm": 0.08188746124505997, "kl": 0.028045654296875, "learning_rate": 8.926922383915315e-07, "loss": -0.004, "reward": -0.011732706800103188, "reward_std": 0.44251058250665665, "rewards/cosine_scaled_reward": -0.1621163571253419, "rewards/format_reward": 0.3125, "step": 151 }, { "completion_length": 3252.1458740234375, "epoch": 0.1737142857142857, "grad_norm": 0.17107248306274414, "kl": 0.039520263671875, "learning_rate": 8.906477750432903e-07, "loss": 0.0199, "reward": 0.2528679259121418, "reward_std": 0.782855249941349, "rewards/cosine_scaled_reward": 0.0014339573681354523, "rewards/format_reward": 0.2500000111758709, "step": 152 }, { "completion_length": 2536.1458740234375, "epoch": 0.17485714285714285, "grad_norm": 0.13167858123779297, "kl": 0.02642822265625, "learning_rate": 8.88586709003076e-07, "loss": 0.0437, "reward": 0.8370774015784264, "reward_std": 0.7839193791151047, "rewards/cosine_scaled_reward": 0.08520536310970783, "rewards/format_reward": 0.6666666865348816, "step": 153 }, { "completion_length": 2881.3125610351562, "epoch": 0.176, "grad_norm": 0.20085100829601288, "kl": 0.0284423828125, "learning_rate": 8.865091407243394e-07, "loss": 0.0533, "reward": 0.6227563321590424, "reward_std": 0.8027107864618301, "rewards/cosine_scaled_reward": 0.0717947967350483, "rewards/format_reward": 0.4791666828095913, "step": 154 }, { "completion_length": 3419.541748046875, "epoch": 0.17714285714285713, "grad_norm": 0.1839601695537567, "kl": 0.03076171875, "learning_rate": 8.844151714648274e-07, "loss": -0.0021, "reward": 0.12460730504244566, "reward_std": 0.942700669169426, "rewards/cosine_scaled_reward": -0.07311302423477173, "rewards/format_reward": 0.2708333432674408, "step": 155 }, { "completion_length": 3294.9791870117188, "epoch": 0.1782857142857143, "grad_norm": 0.15247705578804016, "kl": 0.03961181640625, "learning_rate": 8.823049032816478e-07, "loss": 0.0569, "reward": -0.32880749367177486, "reward_std": 0.5324635952711105, "rewards/cosine_scaled_reward": -0.2685704119503498, "rewards/format_reward": 0.20833334140479565, "step": 156 }, { "completion_length": 2784.7916870117188, "epoch": 0.17942857142857144, "grad_norm": 0.29496413469314575, "kl": 0.0323944091796875, "learning_rate": 8.801784390262943e-07, "loss": 0.0628, "reward": 0.37049394473433495, "reward_std": 1.1466023474931717, "rewards/cosine_scaled_reward": -0.07516971230506897, "rewards/format_reward": 0.5208333432674408, "step": 157 }, { "completion_length": 3182.6875610351562, "epoch": 0.18057142857142858, "grad_norm": 0.1029396653175354, "kl": 0.04150390625, "learning_rate": 8.780358823396352e-07, "loss": 0.024, "reward": -0.27919139340519905, "reward_std": 0.5330808311700821, "rewards/cosine_scaled_reward": -0.2541790306568146, "rewards/format_reward": 0.22916666977107525, "step": 158 }, { "completion_length": 2943.6250610351562, "epoch": 0.18171428571428572, "grad_norm": 0.4807628393173218, "kl": 0.05224609375, "learning_rate": 8.758773376468604e-07, "loss": 0.136, "reward": 0.4126173257827759, "reward_std": 0.9520216137170792, "rewards/cosine_scaled_reward": -0.012441340368241072, "rewards/format_reward": 0.4375000149011612, "step": 159 }, { "completion_length": 2988.7291870117188, "epoch": 0.18285714285714286, "grad_norm": 0.1530563086271286, "kl": 0.041748046875, "learning_rate": 8.737029101523929e-07, "loss": 0.0217, "reward": 0.5735020600259304, "reward_std": 0.8126933425664902, "rewards/cosine_scaled_reward": 0.03675099462270737, "rewards/format_reward": 0.5000000149011612, "step": 160 }, { "completion_length": 3521.916748046875, "epoch": 0.184, "grad_norm": 0.18781894445419312, "kl": 0.04571533203125, "learning_rate": 8.715127058347614e-07, "loss": 0.0335, "reward": -0.04885682836174965, "reward_std": 0.8325313180685043, "rewards/cosine_scaled_reward": -0.12859507277607918, "rewards/format_reward": 0.2083333395421505, "step": 161 }, { "completion_length": 3144.6875, "epoch": 0.18514285714285714, "grad_norm": 0.15695880353450775, "kl": 0.0509033203125, "learning_rate": 8.693068314414344e-07, "loss": 0.033, "reward": 0.5102378875017166, "reward_std": 0.7466369420289993, "rewards/cosine_scaled_reward": -0.02613106439821422, "rewards/format_reward": 0.5625000111758709, "step": 162 }, { "completion_length": 2595.2083435058594, "epoch": 0.18628571428571428, "grad_norm": 0.25721773505210876, "kl": 0.056854248046875, "learning_rate": 8.670853944836176e-07, "loss": -0.002, "reward": 0.3704167567193508, "reward_std": 0.6248408891260624, "rewards/cosine_scaled_reward": -0.06479163467884064, "rewards/format_reward": 0.5000000055879354, "step": 163 }, { "completion_length": 3132.6458740234375, "epoch": 0.18742857142857142, "grad_norm": 0.3121108412742615, "kl": 0.066162109375, "learning_rate": 8.648485032310144e-07, "loss": 0.0436, "reward": -0.15848805382847786, "reward_std": 0.6006623804569244, "rewards/cosine_scaled_reward": -0.21466069296002388, "rewards/format_reward": 0.27083334140479565, "step": 164 }, { "completion_length": 3106.7918090820312, "epoch": 0.18857142857142858, "grad_norm": 0.36176592111587524, "kl": 0.059814453125, "learning_rate": 8.625962667065487e-07, "loss": 0.0527, "reward": 0.523316752165556, "reward_std": 1.1928484439849854, "rewards/cosine_scaled_reward": 0.022075051441788673, "rewards/format_reward": 0.4791666865348816, "step": 165 }, { "completion_length": 2909.6458435058594, "epoch": 0.18971428571428572, "grad_norm": 0.09995611011981964, "kl": 0.0648193359375, "learning_rate": 8.603287946810513e-07, "loss": 0.0269, "reward": 0.30894866585731506, "reward_std": 0.543118342757225, "rewards/cosine_scaled_reward": -0.022608992643654346, "rewards/format_reward": 0.35416666977107525, "step": 166 }, { "completion_length": 3123.3750610351562, "epoch": 0.19085714285714286, "grad_norm": 0.19064339995384216, "kl": 0.065673828125, "learning_rate": 8.580461976679099e-07, "loss": 0.0316, "reward": -0.04249940067529678, "reward_std": 0.6464731246232986, "rewards/cosine_scaled_reward": -0.2087497040629387, "rewards/format_reward": 0.3750000111758709, "step": 167 }, { "completion_length": 2451.8958740234375, "epoch": 0.192, "grad_norm": 0.2923497259616852, "kl": 0.06597900390625, "learning_rate": 8.557485869176825e-07, "loss": 0.0505, "reward": 0.5589314834214747, "reward_std": 0.7117247879505157, "rewards/cosine_scaled_reward": -0.03303426876664162, "rewards/format_reward": 0.6250000149011612, "step": 168 }, { "completion_length": 2313.9375610351562, "epoch": 0.19314285714285714, "grad_norm": 0.22615736722946167, "kl": 0.0550537109375, "learning_rate": 8.534360744126753e-07, "loss": 0.0106, "reward": 0.9692112673074007, "reward_std": 0.9811852872371674, "rewards/cosine_scaled_reward": 0.1616889564320445, "rewards/format_reward": 0.6458333507180214, "step": 169 }, { "completion_length": 2736.4375610351562, "epoch": 0.19428571428571428, "grad_norm": 0.17036058008670807, "kl": 0.08740234375, "learning_rate": 8.511087728614862e-07, "loss": 0.0282, "reward": 0.17224126309156418, "reward_std": 0.5562086030840874, "rewards/cosine_scaled_reward": -0.09096270857844502, "rewards/format_reward": 0.35416666977107525, "step": 170 }, { "completion_length": 2596.1458740234375, "epoch": 0.19542857142857142, "grad_norm": 0.2933753728866577, "kl": 0.1033935546875, "learning_rate": 8.487667956935087e-07, "loss": -0.0277, "reward": 0.3066958854906261, "reward_std": 1.1108788549900055, "rewards/cosine_scaled_reward": 0.0283479536883533, "rewards/format_reward": 0.2500000074505806, "step": 171 }, { "completion_length": 2705.1251220703125, "epoch": 0.19657142857142856, "grad_norm": 0.18700729310512543, "kl": 0.1024169921875, "learning_rate": 8.464102570534061e-07, "loss": 0.0173, "reward": 0.4773051217198372, "reward_std": 0.8035851642489433, "rewards/cosine_scaled_reward": -0.032180776819586754, "rewards/format_reward": 0.5416666828095913, "step": 172 }, { "completion_length": 2499.479248046875, "epoch": 0.1977142857142857, "grad_norm": 0.5721752643585205, "kl": 0.1807861328125, "learning_rate": 8.440392717955475e-07, "loss": 0.0732, "reward": 0.582635186612606, "reward_std": 0.9862835854291916, "rewards/cosine_scaled_reward": -0.010765749961137772, "rewards/format_reward": 0.6041666865348816, "step": 173 }, { "completion_length": 3085.5000610351562, "epoch": 0.19885714285714284, "grad_norm": 0.20046721398830414, "kl": 0.103759765625, "learning_rate": 8.416539554784089e-07, "loss": 0.0112, "reward": 0.35188272781670094, "reward_std": 0.5055751278996468, "rewards/cosine_scaled_reward": -0.032391976565122604, "rewards/format_reward": 0.41666667722165585, "step": 174 }, { "completion_length": 2600.9584350585938, "epoch": 0.2, "grad_norm": 0.27946504950523376, "kl": 0.1097412109375, "learning_rate": 8.392544243589427e-07, "loss": 0.0436, "reward": 0.622465105727315, "reward_std": 0.4762147720903158, "rewards/cosine_scaled_reward": 0.0716492049396038, "rewards/format_reward": 0.4791666679084301, "step": 175 }, { "completion_length": 2800.7084350585938, "epoch": 0.20114285714285715, "grad_norm": 0.27955377101898193, "kl": 0.1202392578125, "learning_rate": 8.368407953869103e-07, "loss": 0.0389, "reward": 0.4877171404659748, "reward_std": 0.9056157171726227, "rewards/cosine_scaled_reward": -0.006141431163996458, "rewards/format_reward": 0.5000000149011612, "step": 176 }, { "completion_length": 2680.6459350585938, "epoch": 0.2022857142857143, "grad_norm": 0.2823414206504822, "kl": 0.1099853515625, "learning_rate": 8.344131861991828e-07, "loss": 0.0552, "reward": 0.033542659133672714, "reward_std": 0.5713647753000259, "rewards/cosine_scaled_reward": -0.1603120118379593, "rewards/format_reward": 0.3541666679084301, "step": 177 }, { "completion_length": 2046.5625305175781, "epoch": 0.20342857142857143, "grad_norm": 0.20538190007209778, "kl": 0.1141357421875, "learning_rate": 8.319717151140072e-07, "loss": 0.0464, "reward": 0.728565389290452, "reward_std": 0.6446417346596718, "rewards/cosine_scaled_reward": 0.07261601462960243, "rewards/format_reward": 0.583333358168602, "step": 178 }, { "completion_length": 2761.5000610351562, "epoch": 0.20457142857142857, "grad_norm": 0.40644508600234985, "kl": 0.1458740234375, "learning_rate": 8.295165011252396e-07, "loss": 0.0513, "reward": 0.404015829320997, "reward_std": 0.853428527712822, "rewards/cosine_scaled_reward": -0.03757544606924057, "rewards/format_reward": 0.4791666828095913, "step": 179 }, { "completion_length": 2943.8958740234375, "epoch": 0.2057142857142857, "grad_norm": 0.5542572736740112, "kl": 0.16357421875, "learning_rate": 8.270476638965461e-07, "loss": 0.0718, "reward": 0.4506250247359276, "reward_std": 0.7890695706009865, "rewards/cosine_scaled_reward": -0.045520816929638386, "rewards/format_reward": 0.5416666716337204, "step": 180 }, { "completion_length": 3004.7709045410156, "epoch": 0.20685714285714285, "grad_norm": 0.2843971252441406, "kl": 0.154541015625, "learning_rate": 8.245653237555705e-07, "loss": 0.0082, "reward": 0.4879231466911733, "reward_std": 0.9720990136265755, "rewards/cosine_scaled_reward": 0.014794901013374329, "rewards/format_reward": 0.4583333358168602, "step": 181 }, { "completion_length": 2924.9375610351562, "epoch": 0.208, "grad_norm": 0.5410143136978149, "kl": 0.20361328125, "learning_rate": 8.220696016880687e-07, "loss": 0.0912, "reward": 0.005998063832521439, "reward_std": 0.7128682732582092, "rewards/cosine_scaled_reward": -0.18450098019093275, "rewards/format_reward": 0.3750000111758709, "step": 182 }, { "completion_length": 2788.166748046875, "epoch": 0.20914285714285713, "grad_norm": 0.33847859501838684, "kl": 0.1676025390625, "learning_rate": 8.195606193320136e-07, "loss": 0.0478, "reward": -0.0011163651943206787, "reward_std": 0.5493139624595642, "rewards/cosine_scaled_reward": -0.24014152213931084, "rewards/format_reward": 0.479166679084301, "step": 183 }, { "completion_length": 2438.3541870117188, "epoch": 0.2102857142857143, "grad_norm": 0.5205087065696716, "kl": 0.181884765625, "learning_rate": 8.170384989716657e-07, "loss": -0.002, "reward": 0.896189346909523, "reward_std": 1.161486804485321, "rewards/cosine_scaled_reward": 0.1460113013163209, "rewards/format_reward": 0.6041666865348816, "step": 184 }, { "completion_length": 2972.6250610351562, "epoch": 0.21142857142857144, "grad_norm": 0.5775122046470642, "kl": 0.25244140625, "learning_rate": 8.145033635316128e-07, "loss": 0.0701, "reward": 0.1199110560119152, "reward_std": 0.8271754533052444, "rewards/cosine_scaled_reward": -0.11712781526148319, "rewards/format_reward": 0.3541666679084301, "step": 185 }, { "completion_length": 2840.1875610351562, "epoch": 0.21257142857142858, "grad_norm": 0.3676423728466034, "kl": 0.2158203125, "learning_rate": 8.119553365707802e-07, "loss": 0.059, "reward": 0.5942272543907166, "reward_std": 0.7698107957839966, "rewards/cosine_scaled_reward": 0.057530272752046585, "rewards/format_reward": 0.4791666716337204, "step": 186 }, { "completion_length": 2704.8333740234375, "epoch": 0.21371428571428572, "grad_norm": 0.3322462737560272, "kl": 0.2138671875, "learning_rate": 8.093945422764069e-07, "loss": 0.0419, "reward": 0.4956296235322952, "reward_std": 0.7072524651885033, "rewards/cosine_scaled_reward": 0.10198147594928741, "rewards/format_reward": 0.29166668094694614, "step": 187 }, { "completion_length": 2780.729248046875, "epoch": 0.21485714285714286, "grad_norm": 0.3984168469905853, "kl": 0.288330078125, "learning_rate": 8.068211054579943e-07, "loss": 0.0474, "reward": 0.5863161403685808, "reward_std": 0.9082886129617691, "rewards/cosine_scaled_reward": -0.008925255388021469, "rewards/format_reward": 0.6041666865348816, "step": 188 }, { "completion_length": 2758.1459350585938, "epoch": 0.216, "grad_norm": 0.3293847143650055, "kl": 0.314453125, "learning_rate": 8.04235151541222e-07, "loss": 0.0399, "reward": 0.07589801587164402, "reward_std": 0.6460907310247421, "rewards/cosine_scaled_reward": -0.15996766556054354, "rewards/format_reward": 0.3958333358168602, "step": 189 }, { "completion_length": 3006.666748046875, "epoch": 0.21714285714285714, "grad_norm": 0.3320949971675873, "kl": 0.3447265625, "learning_rate": 8.01636806561836e-07, "loss": 0.0357, "reward": -0.11713236942887306, "reward_std": 0.6270528212189674, "rewards/cosine_scaled_reward": -0.20439952798187733, "rewards/format_reward": 0.29166667349636555, "step": 190 }, { "completion_length": 2770.479248046875, "epoch": 0.21828571428571428, "grad_norm": 0.7219541668891907, "kl": 0.30859375, "learning_rate": 7.990261971595048e-07, "loss": 0.084, "reward": 0.37447334453463554, "reward_std": 0.9116730242967606, "rewards/cosine_scaled_reward": -0.0002633389085531235, "rewards/format_reward": 0.3750000149011612, "step": 191 }, { "completion_length": 2974.6875610351562, "epoch": 0.21942857142857142, "grad_norm": 0.44086411595344543, "kl": 0.38525390625, "learning_rate": 7.964034505716476e-07, "loss": 0.0333, "reward": 0.2543896287679672, "reward_std": 0.9647316783666611, "rewards/cosine_scaled_reward": -0.06030518375337124, "rewards/format_reward": 0.37500001303851604, "step": 192 }, { "completion_length": 2625.5209350585938, "epoch": 0.22057142857142858, "grad_norm": 0.439861536026001, "kl": 0.33935546875, "learning_rate": 7.93768694627233e-07, "loss": 0.0154, "reward": -0.05579917132854462, "reward_std": 0.552303358912468, "rewards/cosine_scaled_reward": -0.19456627347972244, "rewards/format_reward": 0.33333334140479565, "step": 193 }, { "completion_length": 1735.7917175292969, "epoch": 0.22171428571428572, "grad_norm": 0.3492659032344818, "kl": 0.223876953125, "learning_rate": 7.911220577405484e-07, "loss": 0.0307, "reward": 1.0081698819994926, "reward_std": 1.0613654553890228, "rewards/cosine_scaled_reward": 0.10825158283114433, "rewards/format_reward": 0.7916667014360428, "step": 194 }, { "completion_length": 2374.9375610351562, "epoch": 0.22285714285714286, "grad_norm": 0.39783236384391785, "kl": 0.357666015625, "learning_rate": 7.884636689049422e-07, "loss": 0.0483, "reward": 0.519692053552717, "reward_std": 0.8805719166994095, "rewards/cosine_scaled_reward": -0.04223730321973562, "rewards/format_reward": 0.6041666716337204, "step": 195 }, { "completion_length": 2985.0208740234375, "epoch": 0.224, "grad_norm": 0.5228659510612488, "kl": 0.39453125, "learning_rate": 7.857936576865356e-07, "loss": 0.0626, "reward": 0.31096921616699547, "reward_std": 0.9736936837434769, "rewards/cosine_scaled_reward": -0.07368208467960358, "rewards/format_reward": 0.45833334140479565, "step": 196 }, { "completion_length": 3063.1459045410156, "epoch": 0.22514285714285714, "grad_norm": 0.4522063732147217, "kl": 0.39892578125, "learning_rate": 7.831121542179086e-07, "loss": 0.047, "reward": -0.027099967002868652, "reward_std": 0.7299272418022156, "rewards/cosine_scaled_reward": -0.14896666258573532, "rewards/format_reward": 0.27083334140479565, "step": 197 }, { "completion_length": 3130.291748046875, "epoch": 0.22628571428571428, "grad_norm": 0.7724531888961792, "kl": 0.40771484375, "learning_rate": 7.804192891917571e-07, "loss": 0.0823, "reward": 0.07385630160570145, "reward_std": 0.7986228317022324, "rewards/cosine_scaled_reward": -0.18182185851037502, "rewards/format_reward": 0.4375000074505806, "step": 198 }, { "completion_length": 2927.9375610351562, "epoch": 0.22742857142857142, "grad_norm": 0.4081217050552368, "kl": 0.40234375, "learning_rate": 7.777151938545235e-07, "loss": 0.0405, "reward": 0.9421972185373306, "reward_std": 0.8113018572330475, "rewards/cosine_scaled_reward": 0.18984858132898808, "rewards/format_reward": 0.5625000149011612, "step": 199 }, { "completion_length": 2594.6875610351562, "epoch": 0.22857142857142856, "grad_norm": 1.1233628988265991, "kl": 0.4052734375, "learning_rate": 7.75e-07, "loss": 0.1087, "reward": 0.4042139081284404, "reward_std": 0.9797720313072205, "rewards/cosine_scaled_reward": -0.06872639432549477, "rewards/format_reward": 0.5416666772216558, "step": 200 }, { "completion_length": 2749.729217529297, "epoch": 0.2297142857142857, "grad_norm": 0.4544771611690521, "kl": 0.463134765625, "learning_rate": 7.72273839962904e-07, "loss": 0.0332, "reward": 0.049620624631643295, "reward_std": 0.6019374430179596, "rewards/cosine_scaled_reward": -0.18352303700521588, "rewards/format_reward": 0.41666667722165585, "step": 201 }, { "completion_length": 2291.3334350585938, "epoch": 0.23085714285714284, "grad_norm": 0.4469071328639984, "kl": 0.4375, "learning_rate": 7.695368466124296e-07, "loss": 0.0543, "reward": 0.33724231645464897, "reward_std": 0.6383469551801682, "rewards/cosine_scaled_reward": -0.10221217246726155, "rewards/format_reward": 0.5416666716337204, "step": 202 }, { "completion_length": 2692.416717529297, "epoch": 0.232, "grad_norm": 0.9959556460380554, "kl": 0.60302734375, "learning_rate": 7.667891533457718e-07, "loss": 0.0229, "reward": 0.5023867785930634, "reward_std": 0.8520723432302475, "rewards/cosine_scaled_reward": -0.009223278611898422, "rewards/format_reward": 0.520833358168602, "step": 203 }, { "completion_length": 2610.291748046875, "epoch": 0.23314285714285715, "grad_norm": 0.5574892163276672, "kl": 0.54150390625, "learning_rate": 7.640308940816239e-07, "loss": 0.0779, "reward": 0.668186828494072, "reward_std": 0.7796131670475006, "rewards/cosine_scaled_reward": 0.04242673283442855, "rewards/format_reward": 0.5833333488553762, "step": 204 }, { "completion_length": 3158.0625, "epoch": 0.2342857142857143, "grad_norm": 0.9340919256210327, "kl": 0.658203125, "learning_rate": 7.612622032536507e-07, "loss": 0.091, "reward": 0.36712072789669037, "reward_std": 1.0377983078360558, "rewards/cosine_scaled_reward": -0.014356307685375214, "rewards/format_reward": 0.3958333507180214, "step": 205 }, { "completion_length": 3241.7500610351562, "epoch": 0.23542857142857143, "grad_norm": 0.6677758097648621, "kl": 0.7197265625, "learning_rate": 7.584832158039378e-07, "loss": 0.0745, "reward": -0.032032303512096405, "reward_std": 0.7234849855303764, "rewards/cosine_scaled_reward": -0.15143282152712345, "rewards/format_reward": 0.27083334885537624, "step": 206 }, { "completion_length": 2452.1876220703125, "epoch": 0.23657142857142857, "grad_norm": 0.9905790090560913, "kl": 0.4033203125, "learning_rate": 7.556940671764124e-07, "loss": 0.0568, "reward": 0.8914177902042866, "reward_std": 0.8338152915239334, "rewards/cosine_scaled_reward": 0.0811255220323801, "rewards/format_reward": 0.7291666865348816, "step": 207 }, { "completion_length": 2867.4583740234375, "epoch": 0.2377142857142857, "grad_norm": 1.0818088054656982, "kl": 0.6904296875, "learning_rate": 7.528948933102438e-07, "loss": 0.0301, "reward": 0.22067961934953928, "reward_std": 0.46099015325307846, "rewards/cosine_scaled_reward": -0.17091020289808512, "rewards/format_reward": 0.5625000149011612, "step": 208 }, { "completion_length": 2549.354217529297, "epoch": 0.23885714285714285, "grad_norm": 0.5277766585350037, "kl": 0.5927734375, "learning_rate": 7.500858306332172e-07, "loss": 0.0733, "reward": 0.142703301506117, "reward_std": 0.7169675379991531, "rewards/cosine_scaled_reward": -0.26198170334100723, "rewards/format_reward": 0.6666666716337204, "step": 209 }, { "completion_length": 2013.7500610351562, "epoch": 0.24, "grad_norm": 0.610791027545929, "kl": 0.40966796875, "learning_rate": 7.472670160550848e-07, "loss": 0.0513, "reward": 0.6129203364253044, "reward_std": 0.8901711851358414, "rewards/cosine_scaled_reward": -0.026873177848756313, "rewards/format_reward": 0.6666666772216558, "step": 210 }, { "completion_length": 3397.9376220703125, "epoch": 0.24114285714285713, "grad_norm": 0.8708758354187012, "kl": 0.751953125, "learning_rate": 7.444385869608921e-07, "loss": 0.0628, "reward": -0.10053645074367523, "reward_std": 0.5338989198207855, "rewards/cosine_scaled_reward": -0.14401823794469237, "rewards/format_reward": 0.1875000074505806, "step": 211 }, { "completion_length": 2320.8334045410156, "epoch": 0.2422857142857143, "grad_norm": 0.8576116561889648, "kl": 0.481201171875, "learning_rate": 7.416006812042827e-07, "loss": 0.0179, "reward": 0.7511888779699802, "reward_std": 0.8285558968782425, "rewards/cosine_scaled_reward": 0.021427758038043976, "rewards/format_reward": 0.7083333432674408, "step": 212 }, { "completion_length": 3072.2084350585938, "epoch": 0.24342857142857144, "grad_norm": 0.7516844272613525, "kl": 0.6279296875, "learning_rate": 7.387534371007797e-07, "loss": 0.0663, "reward": 0.14471609145402908, "reward_std": 0.5673011243343353, "rewards/cosine_scaled_reward": -0.09430863708257675, "rewards/format_reward": 0.3333333395421505, "step": 213 }, { "completion_length": 3004.666748046875, "epoch": 0.24457142857142858, "grad_norm": 0.650104820728302, "kl": 0.49853515625, "learning_rate": 7.358969934210438e-07, "loss": 0.048, "reward": 0.38014761358499527, "reward_std": 0.6449386551976204, "rewards/cosine_scaled_reward": -0.05992620065808296, "rewards/format_reward": 0.5000000055879354, "step": 214 }, { "completion_length": 2997.1251220703125, "epoch": 0.24571428571428572, "grad_norm": 0.8768295049667358, "kl": 0.55859375, "learning_rate": 7.330314893841101e-07, "loss": 0.0617, "reward": 0.14181075803935528, "reward_std": 0.7453153133392334, "rewards/cosine_scaled_reward": -0.21034463122487068, "rewards/format_reward": 0.5625000149011612, "step": 215 }, { "completion_length": 3182.6250610351562, "epoch": 0.24685714285714286, "grad_norm": 0.5447856187820435, "kl": 0.52685546875, "learning_rate": 7.301570646506027e-07, "loss": 0.0435, "reward": -0.2610638588666916, "reward_std": 0.5414926931262016, "rewards/cosine_scaled_reward": -0.2451152689754963, "rewards/format_reward": 0.2291666679084301, "step": 216 }, { "completion_length": 2864.8334350585938, "epoch": 0.248, "grad_norm": 0.5242255330085754, "kl": 0.46875, "learning_rate": 7.27273859315928e-07, "loss": 0.0353, "reward": 0.28853584825992584, "reward_std": 0.5657162964344025, "rewards/cosine_scaled_reward": -0.11614875216037035, "rewards/format_reward": 0.520833358168602, "step": 217 }, { "completion_length": 2654.9583740234375, "epoch": 0.24914285714285714, "grad_norm": 0.9366975426673889, "kl": 0.392578125, "learning_rate": 7.243820139034464e-07, "loss": 0.0515, "reward": 0.3301328122615814, "reward_std": 0.7091851830482483, "rewards/cosine_scaled_reward": -0.04326693775783497, "rewards/format_reward": 0.4166666828095913, "step": 218 }, { "completion_length": 2303.854217529297, "epoch": 0.2502857142857143, "grad_norm": 1.7971564531326294, "kl": 0.3369140625, "learning_rate": 7.214816693576234e-07, "loss": 0.0794, "reward": 0.6591267697513103, "reward_std": 0.9642367362976074, "rewards/cosine_scaled_reward": 0.03789670951664448, "rewards/format_reward": 0.583333358168602, "step": 219 }, { "completion_length": 2634.2501220703125, "epoch": 0.25142857142857145, "grad_norm": 1.3504126071929932, "kl": 0.4423828125, "learning_rate": 7.185729670371604e-07, "loss": -0.0076, "reward": 0.41383227705955505, "reward_std": 0.64960727840662, "rewards/cosine_scaled_reward": -0.043083855882287025, "rewards/format_reward": 0.5000000204890966, "step": 220 }, { "completion_length": 2984.7709350585938, "epoch": 0.25257142857142856, "grad_norm": 0.9762473106384277, "kl": 0.4384765625, "learning_rate": 7.156560487081051e-07, "loss": 0.0725, "reward": 0.25423768046312034, "reward_std": 0.8094103336334229, "rewards/cosine_scaled_reward": -0.11246450617909431, "rewards/format_reward": 0.4791666716337204, "step": 221 }, { "completion_length": 3275.7500610351562, "epoch": 0.2537142857142857, "grad_norm": 0.37796396017074585, "kl": 0.533203125, "learning_rate": 7.127310565369415e-07, "loss": 0.0546, "reward": 0.08308765979018062, "reward_std": 0.6242133527994156, "rewards/cosine_scaled_reward": -0.18762284144759178, "rewards/format_reward": 0.4583333395421505, "step": 222 }, { "completion_length": 2415.2916870117188, "epoch": 0.25485714285714284, "grad_norm": 66.52708435058594, "kl": 19.72021484375, "learning_rate": 7.097981330836616e-07, "loss": 0.1598, "reward": 0.32358624786138535, "reward_std": 0.8794360756874084, "rewards/cosine_scaled_reward": -0.12987355142831802, "rewards/format_reward": 0.5833333507180214, "step": 223 }, { "completion_length": 2914.2084350585938, "epoch": 0.256, "grad_norm": 0.39709535241127014, "kl": 0.42919921875, "learning_rate": 7.068574212948169e-07, "loss": 0.026, "reward": 0.4726352207362652, "reward_std": 0.5715819150209427, "rewards/cosine_scaled_reward": -0.02409905381500721, "rewards/format_reward": 0.5208333432674408, "step": 224 }, { "completion_length": 2191.4584045410156, "epoch": 0.2571428571428571, "grad_norm": 1.4947963953018188, "kl": 0.361572265625, "learning_rate": 7.039090644965509e-07, "loss": 0.0904, "reward": 0.8724448978900909, "reward_std": 0.8835494965314865, "rewards/cosine_scaled_reward": 0.16538911685347557, "rewards/format_reward": 0.541666679084301, "step": 225 }, { "completion_length": 2944.8959350585938, "epoch": 0.2582857142857143, "grad_norm": 0.8030902147293091, "kl": 0.5966796875, "learning_rate": 7.009532063876148e-07, "loss": 0.0354, "reward": 0.16449306067079306, "reward_std": 0.7553341090679169, "rewards/cosine_scaled_reward": -0.14692013710737228, "rewards/format_reward": 0.4583333432674408, "step": 226 }, { "completion_length": 2399.1251220703125, "epoch": 0.25942857142857145, "grad_norm": 0.6294677257537842, "kl": 0.40478515625, "learning_rate": 6.979899910323624e-07, "loss": 0.0385, "reward": 0.6515897959470749, "reward_std": 0.7883607298135757, "rewards/cosine_scaled_reward": -0.01795511320233345, "rewards/format_reward": 0.6875000149011612, "step": 227 }, { "completion_length": 2944.2916870117188, "epoch": 0.26057142857142856, "grad_norm": 0.7098054885864258, "kl": 0.5126953125, "learning_rate": 6.950195628537299e-07, "loss": 0.0537, "reward": 0.2890019528567791, "reward_std": 0.8232990577816963, "rewards/cosine_scaled_reward": -0.1367490328848362, "rewards/format_reward": 0.5625, "step": 228 }, { "completion_length": 2747.541748046875, "epoch": 0.26171428571428573, "grad_norm": 0.3639421761035919, "kl": 0.53759765625, "learning_rate": 6.920420666261961e-07, "loss": 0.0462, "reward": 0.1284541985951364, "reward_std": 0.6105376034975052, "rewards/cosine_scaled_reward": -0.21702291443943977, "rewards/format_reward": 0.5625000149011612, "step": 229 }, { "completion_length": 2547.916748046875, "epoch": 0.26285714285714284, "grad_norm": 0.7889376878738403, "kl": 0.4453125, "learning_rate": 6.890576474687263e-07, "loss": 0.0666, "reward": 0.46958625549450517, "reward_std": 0.8848246484994888, "rewards/cosine_scaled_reward": 0.03687644610181451, "rewards/format_reward": 0.3958333395421505, "step": 230 }, { "completion_length": 2979.3125610351562, "epoch": 0.264, "grad_norm": 0.49910208582878113, "kl": 0.56689453125, "learning_rate": 6.860664508377001e-07, "loss": 0.0658, "reward": 0.34871126525104046, "reward_std": 0.7629459947347641, "rewards/cosine_scaled_reward": -0.0756443589925766, "rewards/format_reward": 0.5000000149011612, "step": 231 }, { "completion_length": 2503.5625610351562, "epoch": 0.2651428571428571, "grad_norm": 0.8284872174263, "kl": 0.412109375, "learning_rate": 6.83068622519821e-07, "loss": 0.0204, "reward": 0.6350362692028284, "reward_std": 1.1135509312152863, "rewards/cosine_scaled_reward": -0.02623187005519867, "rewards/format_reward": 0.6875000149011612, "step": 232 }, { "completion_length": 2727.8751220703125, "epoch": 0.2662857142857143, "grad_norm": 0.5221201181411743, "kl": 0.4931640625, "learning_rate": 6.800643086250121e-07, "loss": 0.0615, "reward": 0.4846220053732395, "reward_std": 0.7716068103909492, "rewards/cosine_scaled_reward": -0.049355676397681236, "rewards/format_reward": 0.5833333432674408, "step": 233 }, { "completion_length": 2544.7084350585938, "epoch": 0.2674285714285714, "grad_norm": 1.3812953233718872, "kl": 0.498046875, "learning_rate": 6.770536555792944e-07, "loss": -0.0119, "reward": 0.4157133437693119, "reward_std": 0.7185128927230835, "rewards/cosine_scaled_reward": -0.13589332532137632, "rewards/format_reward": 0.6875000149011612, "step": 234 }, { "completion_length": 2495.375045776367, "epoch": 0.26857142857142857, "grad_norm": 0.6437314748764038, "kl": 0.59716796875, "learning_rate": 6.740368101176495e-07, "loss": 0.0412, "reward": 0.5019040778279305, "reward_std": 0.6978631764650345, "rewards/cosine_scaled_reward": -0.019881299696862698, "rewards/format_reward": 0.5416666753590107, "step": 235 }, { "completion_length": 2483.3959350585938, "epoch": 0.26971428571428574, "grad_norm": 0.3919011950492859, "kl": 0.4892578125, "learning_rate": 6.710139192768694e-07, "loss": 0.0482, "reward": 0.2438975148834288, "reward_std": 0.648132249712944, "rewards/cosine_scaled_reward": -0.2009679153561592, "rewards/format_reward": 0.6458333656191826, "step": 236 }, { "completion_length": 2204.4584350585938, "epoch": 0.27085714285714285, "grad_norm": 0.8478395342826843, "kl": 0.39111328125, "learning_rate": 6.679851303883891e-07, "loss": 0.0545, "reward": 0.42290161666460335, "reward_std": 0.648314818739891, "rewards/cosine_scaled_reward": -0.06979918852448463, "rewards/format_reward": 0.5625000149011612, "step": 237 }, { "completion_length": 2635.062530517578, "epoch": 0.272, "grad_norm": 1.0054919719696045, "kl": 0.572265625, "learning_rate": 6.649505910711058e-07, "loss": 0.0721, "reward": 0.5835281796753407, "reward_std": 0.7386454343795776, "rewards/cosine_scaled_reward": -0.09365258179605007, "rewards/format_reward": 0.770833358168602, "step": 238 }, { "completion_length": 3080.6875610351562, "epoch": 0.27314285714285713, "grad_norm": 0.8045799136161804, "kl": 0.7119140625, "learning_rate": 6.619104492241847e-07, "loss": 0.0514, "reward": 0.16217913012951612, "reward_std": 0.8966347873210907, "rewards/cosine_scaled_reward": -0.1376604586839676, "rewards/format_reward": 0.43750000558793545, "step": 239 }, { "completion_length": 2219.5000915527344, "epoch": 0.2742857142857143, "grad_norm": 1.3121085166931152, "kl": 0.403564453125, "learning_rate": 6.588648530198504e-07, "loss": 0.0747, "reward": 1.15125173330307, "reward_std": 0.957096055150032, "rewards/cosine_scaled_reward": 0.200625860132277, "rewards/format_reward": 0.7500000149011612, "step": 240 }, { "completion_length": 2043.0625610351562, "epoch": 0.2754285714285714, "grad_norm": 0.6292615532875061, "kl": 0.319580078125, "learning_rate": 6.558139508961654e-07, "loss": 0.0002, "reward": 0.9169554859399796, "reward_std": 0.5727524533867836, "rewards/cosine_scaled_reward": 0.07306107506155968, "rewards/format_reward": 0.770833358168602, "step": 241 }, { "completion_length": 2313.4583740234375, "epoch": 0.2765714285714286, "grad_norm": 0.6727687120437622, "kl": 0.4599609375, "learning_rate": 6.527578915497951e-07, "loss": 0.0547, "reward": 0.634780153632164, "reward_std": 0.7665407210588455, "rewards/cosine_scaled_reward": -0.0992765948176384, "rewards/format_reward": 0.8333333432674408, "step": 242 }, { "completion_length": 2128.354248046875, "epoch": 0.2777142857142857, "grad_norm": 2.353132963180542, "kl": 0.46875, "learning_rate": 6.496968239287603e-07, "loss": -0.0288, "reward": 0.7288870755583048, "reward_std": 0.7078111618757248, "rewards/cosine_scaled_reward": -0.020973138511180878, "rewards/format_reward": 0.770833358168602, "step": 243 }, { "completion_length": 2385.729217529297, "epoch": 0.27885714285714286, "grad_norm": 0.797772228717804, "kl": 0.435546875, "learning_rate": 6.466308972251785e-07, "loss": 0.0694, "reward": 0.9379732981324196, "reward_std": 0.76512710750103, "rewards/cosine_scaled_reward": 0.08356995694339275, "rewards/format_reward": 0.770833358168602, "step": 244 }, { "completion_length": 2175.229217529297, "epoch": 0.28, "grad_norm": 0.4513607621192932, "kl": 0.4609375, "learning_rate": 6.435602608679916e-07, "loss": 0.0361, "reward": 0.7639665333554149, "reward_std": 0.5898980349302292, "rewards/cosine_scaled_reward": -0.0034334324300289154, "rewards/format_reward": 0.770833358168602, "step": 245 }, { "completion_length": 2385.3959045410156, "epoch": 0.28114285714285714, "grad_norm": 1.354136347770691, "kl": 0.4619140625, "learning_rate": 6.404850645156841e-07, "loss": -0.0114, "reward": 0.5757800415158272, "reward_std": 0.4861333817243576, "rewards/cosine_scaled_reward": -0.09752664715051651, "rewards/format_reward": 0.7708333432674408, "step": 246 }, { "completion_length": 1984.0834045410156, "epoch": 0.2822857142857143, "grad_norm": 0.7202406525611877, "kl": 0.39306640625, "learning_rate": 6.374054580489873e-07, "loss": -0.0064, "reward": 0.7016956266015768, "reward_std": 0.6964651569724083, "rewards/cosine_scaled_reward": -0.03456886112689972, "rewards/format_reward": 0.770833358168602, "step": 247 }, { "completion_length": 2431.6250610351562, "epoch": 0.2834285714285714, "grad_norm": 1.12034273147583, "kl": 0.375, "learning_rate": 6.343215915635761e-07, "loss": 0.0399, "reward": 0.47921356186270714, "reward_std": 0.7437918186187744, "rewards/cosine_scaled_reward": -0.10414323909208179, "rewards/format_reward": 0.6875000149011612, "step": 248 }, { "completion_length": 2641.5000610351562, "epoch": 0.2845714285714286, "grad_norm": 1.147722601890564, "kl": 0.466796875, "learning_rate": 6.31233615362752e-07, "loss": 0.0084, "reward": 0.4995560571551323, "reward_std": 0.7342625856399536, "rewards/cosine_scaled_reward": -0.04188864305615425, "rewards/format_reward": 0.583333358168602, "step": 249 }, { "completion_length": 2112.0209045410156, "epoch": 0.2857142857142857, "grad_norm": 0.6532469987869263, "kl": 0.302490234375, "learning_rate": 6.281416799501187e-07, "loss": 0.0292, "reward": 0.6722276238724589, "reward_std": 1.072887122631073, "rewards/cosine_scaled_reward": 0.03403047751635313, "rewards/format_reward": 0.6041666865348816, "step": 250 }, { "completion_length": 2693.416748046875, "epoch": 0.28685714285714287, "grad_norm": 0.9663844108581543, "kl": 0.419921875, "learning_rate": 6.25045936022246e-07, "loss": 0.0569, "reward": 0.9957753866910934, "reward_std": 0.9329462796449661, "rewards/cosine_scaled_reward": 0.16455435939133167, "rewards/format_reward": 0.6666666865348816, "step": 251 }, { "completion_length": 2677.7709350585938, "epoch": 0.288, "grad_norm": 0.720365583896637, "kl": 0.42138671875, "learning_rate": 6.219465344613258e-07, "loss": 0.0584, "reward": 0.791405975818634, "reward_std": 0.8207461088895798, "rewards/cosine_scaled_reward": -0.010547026991844177, "rewards/format_reward": 0.8125000298023224, "step": 252 }, { "completion_length": 2163.2709350585938, "epoch": 0.28914285714285715, "grad_norm": 0.9754706025123596, "kl": 0.333984375, "learning_rate": 6.188436263278172e-07, "loss": -0.0284, "reward": 0.4755242392420769, "reward_std": 0.9357906579971313, "rewards/cosine_scaled_reward": -0.05390455946326256, "rewards/format_reward": 0.583333358168602, "step": 253 }, { "completion_length": 2022.0834045410156, "epoch": 0.29028571428571426, "grad_norm": 0.7189564108848572, "kl": 0.29931640625, "learning_rate": 6.157373628530852e-07, "loss": -0.0134, "reward": 1.0547878816723824, "reward_std": 0.6990637332201004, "rewards/cosine_scaled_reward": 0.162810567766428, "rewards/format_reward": 0.7291666865348816, "step": 254 }, { "completion_length": 2299.6041870117188, "epoch": 0.2914285714285714, "grad_norm": 0.6565377712249756, "kl": 0.3150634765625, "learning_rate": 6.126278954320294e-07, "loss": 0.0082, "reward": 0.9156973995268345, "reward_std": 0.7535882145166397, "rewards/cosine_scaled_reward": 0.08284871588693932, "rewards/format_reward": 0.7500000149011612, "step": 255 }, { "completion_length": 1579.4583892822266, "epoch": 0.2925714285714286, "grad_norm": 0.25218111276626587, "kl": 0.13134765625, "learning_rate": 6.095153756157051e-07, "loss": -0.0037, "reward": 0.6594964060932398, "reward_std": 0.7463338524103165, "rewards/cosine_scaled_reward": -0.04525182023644447, "rewards/format_reward": 0.7500000149011612, "step": 256 }, { "completion_length": 2658.6876220703125, "epoch": 0.2937142857142857, "grad_norm": 0.475395530462265, "kl": 0.332275390625, "learning_rate": 6.06399955103937e-07, "loss": 0.0439, "reward": 0.4807323142886162, "reward_std": 0.7335182875394821, "rewards/cosine_scaled_reward": -0.1450505219399929, "rewards/format_reward": 0.7708333432674408, "step": 257 }, { "completion_length": 2290.5000610351562, "epoch": 0.2948571428571429, "grad_norm": 0.5613760948181152, "kl": 0.2305908203125, "learning_rate": 6.032817857379256e-07, "loss": 0.0305, "reward": 0.5192163055762649, "reward_std": 0.7799556702375412, "rewards/cosine_scaled_reward": -0.021641843486577272, "rewards/format_reward": 0.5625000149011612, "step": 258 }, { "completion_length": 2217.8959350585938, "epoch": 0.296, "grad_norm": 1.199144959449768, "kl": 0.24127197265625, "learning_rate": 6.001610194928464e-07, "loss": 0.049, "reward": 0.5793692320585251, "reward_std": 0.7019505053758621, "rewards/cosine_scaled_reward": -0.09573205607011914, "rewards/format_reward": 0.7708333432674408, "step": 259 }, { "completion_length": 2786.041748046875, "epoch": 0.29714285714285715, "grad_norm": 0.7002319693565369, "kl": 0.292236328125, "learning_rate": 5.97037808470444e-07, "loss": 0.0086, "reward": 0.5236682705581188, "reward_std": 0.5017373934388161, "rewards/cosine_scaled_reward": -0.10274921730160713, "rewards/format_reward": 0.7291666865348816, "step": 260 }, { "completion_length": 2164.291717529297, "epoch": 0.29828571428571427, "grad_norm": 0.2812724709510803, "kl": 0.186279296875, "learning_rate": 5.939123048916173e-07, "loss": 0.0171, "reward": 0.6918911039829254, "reward_std": 0.4820164740085602, "rewards/cosine_scaled_reward": -0.07072112709283829, "rewards/format_reward": 0.8333333432674408, "step": 261 }, { "completion_length": 2519.5625610351562, "epoch": 0.29942857142857143, "grad_norm": 0.4466201663017273, "kl": 0.25927734375, "learning_rate": 5.907846610890011e-07, "loss": 0.0037, "reward": 0.45665838569402695, "reward_std": 0.7808536291122437, "rewards/cosine_scaled_reward": -0.14667082950472832, "rewards/format_reward": 0.7500000149011612, "step": 262 }, { "completion_length": 2283.0208435058594, "epoch": 0.30057142857142854, "grad_norm": 0.9734614491462708, "kl": 0.24072265625, "learning_rate": 5.87655029499542e-07, "loss": -0.0445, "reward": 0.6200529932975769, "reward_std": 0.9734015464782715, "rewards/cosine_scaled_reward": -0.05455685779452324, "rewards/format_reward": 0.7291666716337204, "step": 263 }, { "completion_length": 2269.729248046875, "epoch": 0.3017142857142857, "grad_norm": 0.93758225440979, "kl": 0.242919921875, "learning_rate": 5.845235626570683e-07, "loss": 0.0552, "reward": 0.5712921991944313, "reward_std": 0.6152775660157204, "rewards/cosine_scaled_reward": -0.0789372380822897, "rewards/format_reward": 0.7291666865348816, "step": 264 }, { "completion_length": 2714.729248046875, "epoch": 0.3028571428571429, "grad_norm": 0.4690639078617096, "kl": 0.28564453125, "learning_rate": 5.813904131848564e-07, "loss": 0.0054, "reward": 0.33216356858611107, "reward_std": 0.5296753197908401, "rewards/cosine_scaled_reward": -0.11516822502017021, "rewards/format_reward": 0.5625000223517418, "step": 265 }, { "completion_length": 2834.8750610351562, "epoch": 0.304, "grad_norm": 0.6644603610038757, "kl": 0.278076171875, "learning_rate": 5.78255733788191e-07, "loss": 0.0086, "reward": 0.7553704380989075, "reward_std": 0.6663154512643814, "rewards/cosine_scaled_reward": -0.059814791195094585, "rewards/format_reward": 0.8750000149011612, "step": 266 }, { "completion_length": 2623.7291870117188, "epoch": 0.30514285714285716, "grad_norm": 0.4014008343219757, "kl": 0.30078125, "learning_rate": 5.751196772469237e-07, "loss": 0.0276, "reward": 0.574170459061861, "reward_std": 0.6768613308668137, "rewards/cosine_scaled_reward": -0.046248115599155426, "rewards/format_reward": 0.6666666865348816, "step": 267 }, { "completion_length": 2934.2916870117188, "epoch": 0.3062857142857143, "grad_norm": 0.32006382942199707, "kl": 0.24169921875, "learning_rate": 5.71982396408026e-07, "loss": 0.0186, "reward": 0.5890230983495712, "reward_std": 0.6336611211299896, "rewards/cosine_scaled_reward": -0.0388217861764133, "rewards/format_reward": 0.666666679084301, "step": 268 }, { "completion_length": 2591.6459350585938, "epoch": 0.30742857142857144, "grad_norm": 0.2750188410282135, "kl": 0.2086181640625, "learning_rate": 5.688440441781398e-07, "loss": 0.0096, "reward": 0.4631531648337841, "reward_std": 0.5730658769607544, "rewards/cosine_scaled_reward": -0.15384008269757032, "rewards/format_reward": 0.7708333432674408, "step": 269 }, { "completion_length": 1949.6250305175781, "epoch": 0.30857142857142855, "grad_norm": 0.3348838686943054, "kl": 0.14434814453125, "learning_rate": 5.657047735161255e-07, "loss": 0.0019, "reward": 1.0058863386511803, "reward_std": 0.6113419234752655, "rewards/cosine_scaled_reward": 0.05502649489790201, "rewards/format_reward": 0.895833358168602, "step": 270 }, { "completion_length": 2595.8333740234375, "epoch": 0.3097142857142857, "grad_norm": 0.3792303502559662, "kl": 0.18743896484375, "learning_rate": 5.625647374256061e-07, "loss": 0.0156, "reward": 1.184450313448906, "reward_std": 0.6347895562648773, "rewards/cosine_scaled_reward": 0.18597513809800148, "rewards/format_reward": 0.8125000149011612, "step": 271 }, { "completion_length": 3300.4583740234375, "epoch": 0.31085714285714283, "grad_norm": 0.4754711091518402, "kl": 0.2998046875, "learning_rate": 5.594240889475106e-07, "loss": 0.0323, "reward": 0.33772575482726097, "reward_std": 0.7981042563915253, "rewards/cosine_scaled_reward": -0.12280379980802536, "rewards/format_reward": 0.583333358168602, "step": 272 }, { "completion_length": 2806.5208740234375, "epoch": 0.312, "grad_norm": 0.2589206397533417, "kl": 0.203857421875, "learning_rate": 5.562829811526154e-07, "loss": 0.0018, "reward": 0.4326868951320648, "reward_std": 0.6429417282342911, "rewards/cosine_scaled_reward": -0.1378232277929783, "rewards/format_reward": 0.7083333432674408, "step": 273 }, { "completion_length": 2775.6043090820312, "epoch": 0.31314285714285717, "grad_norm": 0.392734557390213, "kl": 0.182861328125, "learning_rate": 5.531415671340826e-07, "loss": 0.0352, "reward": 0.39707405120134354, "reward_std": 0.748130202293396, "rewards/cosine_scaled_reward": -0.11396298557519913, "rewards/format_reward": 0.6250000223517418, "step": 274 }, { "completion_length": 2929.979278564453, "epoch": 0.3142857142857143, "grad_norm": 0.700515627861023, "kl": 0.240478515625, "learning_rate": 5.5e-07, "loss": 0.0581, "reward": 0.3950451835989952, "reward_std": 0.9513901323080063, "rewards/cosine_scaled_reward": -0.06289407718577422, "rewards/format_reward": 0.5208333432674408, "step": 275 }, { "completion_length": 2392.2291870117188, "epoch": 0.31542857142857145, "grad_norm": 0.6831299066543579, "kl": 0.146484375, "learning_rate": 5.468584328659172e-07, "loss": 0.0305, "reward": 0.8106965273618698, "reward_std": 0.8061726838350296, "rewards/cosine_scaled_reward": -0.011318429373204708, "rewards/format_reward": 0.8333333432674408, "step": 276 }, { "completion_length": 2669.7709350585938, "epoch": 0.31657142857142856, "grad_norm": 1.2274115085601807, "kl": 0.221435546875, "learning_rate": 5.437170188473847e-07, "loss": 0.0847, "reward": 0.44736091792583466, "reward_std": 0.8726006895303726, "rewards/cosine_scaled_reward": -0.09923620894551277, "rewards/format_reward": 0.6458333432674408, "step": 277 }, { "completion_length": 2377.7500915527344, "epoch": 0.3177142857142857, "grad_norm": 0.6143187284469604, "kl": 0.225341796875, "learning_rate": 5.405759110524894e-07, "loss": 0.0193, "reward": 0.5976903513073921, "reward_std": 0.974912166595459, "rewards/cosine_scaled_reward": -0.0032381737837567925, "rewards/format_reward": 0.6041666865348816, "step": 278 }, { "completion_length": 2511.2083740234375, "epoch": 0.31885714285714284, "grad_norm": 0.7699910998344421, "kl": 0.2982177734375, "learning_rate": 5.37435262574394e-07, "loss": 0.0299, "reward": 0.3957599774003029, "reward_std": 0.8634193539619446, "rewards/cosine_scaled_reward": -0.10420336201786995, "rewards/format_reward": 0.604166679084301, "step": 279 }, { "completion_length": 2782.8750610351562, "epoch": 0.32, "grad_norm": 0.9926307201385498, "kl": 0.310791015625, "learning_rate": 5.342952264838747e-07, "loss": 0.0676, "reward": 0.6104128423612565, "reward_std": 0.8384141325950623, "rewards/cosine_scaled_reward": -0.028126917779445648, "rewards/format_reward": 0.6666666865348816, "step": 280 }, { "completion_length": 2380.4584350585938, "epoch": 0.3211428571428571, "grad_norm": 0.883975088596344, "kl": 0.2364501953125, "learning_rate": 5.311559558218603e-07, "loss": -0.0298, "reward": 0.6390588581562042, "reward_std": 0.7505539357662201, "rewards/cosine_scaled_reward": -0.055470582097768784, "rewards/format_reward": 0.7500000074505806, "step": 281 }, { "completion_length": 2751.1666870117188, "epoch": 0.3222857142857143, "grad_norm": 0.6628551483154297, "kl": 0.340087890625, "learning_rate": 5.28017603591974e-07, "loss": 0.0545, "reward": 0.8024181574583054, "reward_std": 0.9694567918777466, "rewards/cosine_scaled_reward": 0.026209060102701187, "rewards/format_reward": 0.7500000298023224, "step": 282 }, { "completion_length": 2520.1458740234375, "epoch": 0.32342857142857145, "grad_norm": 0.5402534604072571, "kl": 0.352783203125, "learning_rate": 5.248803227530763e-07, "loss": 0.0129, "reward": 0.4531768709421158, "reward_std": 0.6381779089570045, "rewards/cosine_scaled_reward": -0.1588282436132431, "rewards/format_reward": 0.770833358168602, "step": 283 }, { "completion_length": 2361.5625, "epoch": 0.32457142857142857, "grad_norm": 0.7840125560760498, "kl": 0.43798828125, "learning_rate": 5.21744266211809e-07, "loss": 0.0189, "reward": 0.3853081315755844, "reward_std": 0.7855608388781548, "rewards/cosine_scaled_reward": -0.07817927654832602, "rewards/format_reward": 0.5416666716337204, "step": 284 }, { "completion_length": 2909.3751220703125, "epoch": 0.32571428571428573, "grad_norm": 0.543645441532135, "kl": 0.51806640625, "learning_rate": 5.186095868151436e-07, "loss": 0.059, "reward": 0.0715614715591073, "reward_std": 0.6991735994815826, "rewards/cosine_scaled_reward": -0.22463593445718288, "rewards/format_reward": 0.5208333358168602, "step": 285 }, { "completion_length": 2623.5000610351562, "epoch": 0.32685714285714285, "grad_norm": 1.0876595973968506, "kl": 0.3642578125, "learning_rate": 5.154764373429315e-07, "loss": 0.0895, "reward": 0.7619921118021011, "reward_std": 1.0285737365484238, "rewards/cosine_scaled_reward": 0.047662717290222645, "rewards/format_reward": 0.6666666865348816, "step": 286 }, { "completion_length": 2762.666748046875, "epoch": 0.328, "grad_norm": 0.7187138795852661, "kl": 0.50048828125, "learning_rate": 5.123449705004581e-07, "loss": 0.043, "reward": 0.5433498155325651, "reward_std": 0.6913661956787109, "rewards/cosine_scaled_reward": -0.061658430844545364, "rewards/format_reward": 0.6666666865348816, "step": 287 }, { "completion_length": 2270.8333740234375, "epoch": 0.3291428571428571, "grad_norm": 0.34955894947052, "kl": 0.260986328125, "learning_rate": 5.09215338910999e-07, "loss": 0.019, "reward": 0.9035947173833847, "reward_std": 0.8012775778770447, "rewards/cosine_scaled_reward": -0.006535984575748444, "rewards/format_reward": 0.9166666865348816, "step": 288 }, { "completion_length": 2480.8541870117188, "epoch": 0.3302857142857143, "grad_norm": 1.0728695392608643, "kl": 0.474609375, "learning_rate": 5.060876951083828e-07, "loss": 0.0877, "reward": 0.5563938245177269, "reward_std": 0.8119515627622604, "rewards/cosine_scaled_reward": -0.06555308337556198, "rewards/format_reward": 0.6875000223517418, "step": 289 }, { "completion_length": 2005.3542175292969, "epoch": 0.3314285714285714, "grad_norm": 2.5518229007720947, "kl": 0.4202880859375, "learning_rate": 5.02962191529556e-07, "loss": 0.1377, "reward": 1.0121518671512604, "reward_std": 1.0199929028749466, "rewards/cosine_scaled_reward": 0.14149258099496365, "rewards/format_reward": 0.7291666865348816, "step": 290 }, { "completion_length": 1837.25, "epoch": 0.3325714285714286, "grad_norm": 0.5082411766052246, "kl": 0.318115234375, "learning_rate": 4.998389805071536e-07, "loss": -0.0025, "reward": 0.5244562700390816, "reward_std": 0.8083207458257675, "rewards/cosine_scaled_reward": -0.09193855058401823, "rewards/format_reward": 0.7083333432674408, "step": 291 }, { "completion_length": 2516.000030517578, "epoch": 0.33371428571428574, "grad_norm": 0.6963807344436646, "kl": 0.496826171875, "learning_rate": 4.967182142620745e-07, "loss": 0.0554, "reward": 0.6148294545710087, "reward_std": 0.7742474526166916, "rewards/cosine_scaled_reward": -0.025918614119291306, "rewards/format_reward": 0.666666679084301, "step": 292 }, { "completion_length": 2563.354248046875, "epoch": 0.33485714285714285, "grad_norm": 0.4553970992565155, "kl": 0.64111328125, "learning_rate": 4.93600044896063e-07, "loss": 0.08, "reward": 0.4226888967677951, "reward_std": 0.8445644974708557, "rewards/cosine_scaled_reward": -0.12198889185674489, "rewards/format_reward": 0.666666679084301, "step": 293 }, { "completion_length": 2474.6459350585938, "epoch": 0.336, "grad_norm": 0.5785382390022278, "kl": 0.543212890625, "learning_rate": 4.904846243842949e-07, "loss": 0.0498, "reward": 0.7478385232388973, "reward_std": 0.7380570024251938, "rewards/cosine_scaled_reward": 0.08225257322192192, "rewards/format_reward": 0.5833333432674408, "step": 294 }, { "completion_length": 2818.1043090820312, "epoch": 0.33714285714285713, "grad_norm": 1.9920473098754883, "kl": 1.005859375, "learning_rate": 4.873721045679706e-07, "loss": 0.0599, "reward": 0.38695642724633217, "reward_std": 0.8360127806663513, "rewards/cosine_scaled_reward": -0.0461051338352263, "rewards/format_reward": 0.4791666716337204, "step": 295 }, { "completion_length": 2180.6875610351562, "epoch": 0.3382857142857143, "grad_norm": 1.0185471773147583, "kl": 0.60888671875, "learning_rate": 4.842626371469149e-07, "loss": 0.0929, "reward": 0.9686335474252701, "reward_std": 0.9049602597951889, "rewards/cosine_scaled_reward": 0.1405667569488287, "rewards/format_reward": 0.6875000223517418, "step": 296 }, { "completion_length": 2705.2709350585938, "epoch": 0.3394285714285714, "grad_norm": 1.4574670791625977, "kl": 0.7529296875, "learning_rate": 4.811563736721829e-07, "loss": 0.0525, "reward": 0.3473209235817194, "reward_std": 0.7314907014369965, "rewards/cosine_scaled_reward": -0.12842286378145218, "rewards/format_reward": 0.6041666865348816, "step": 297 }, { "completion_length": 2661.5416870117188, "epoch": 0.3405714285714286, "grad_norm": 1.0324411392211914, "kl": 0.779296875, "learning_rate": 4.780534655386743e-07, "loss": 0.0626, "reward": 0.44023372419178486, "reward_std": 0.7127360999584198, "rewards/cosine_scaled_reward": -0.04029981233179569, "rewards/format_reward": 0.5208333507180214, "step": 298 }, { "completion_length": 2836.0626220703125, "epoch": 0.3417142857142857, "grad_norm": 1.2534230947494507, "kl": 0.66015625, "learning_rate": 4.749540639777539e-07, "loss": 0.0559, "reward": 0.4187684841454029, "reward_std": 0.7654632180929184, "rewards/cosine_scaled_reward": -0.1031157523393631, "rewards/format_reward": 0.6250000260770321, "step": 299 }, { "completion_length": 2193.541717529297, "epoch": 0.34285714285714286, "grad_norm": 1.023747444152832, "kl": 0.4393310546875, "learning_rate": 4.7185832004988133e-07, "loss": 0.0057, "reward": 0.7049860581755638, "reward_std": 0.8015492558479309, "rewards/cosine_scaled_reward": 0.07124301791191101, "rewards/format_reward": 0.5625000223517418, "step": 300 }, { "completion_length": 2034.166748046875, "epoch": 0.344, "grad_norm": 1.0728156566619873, "kl": 0.6123046875, "learning_rate": 4.68766384637248e-07, "loss": 0.0087, "reward": 0.5370926359901205, "reward_std": 0.8870838582515717, "rewards/cosine_scaled_reward": -0.05437035672366619, "rewards/format_reward": 0.645833358168602, "step": 301 }, { "completion_length": 1496.7708587646484, "epoch": 0.34514285714285714, "grad_norm": 0.36257851123809814, "kl": 0.46044921875, "learning_rate": 4.656784084364238e-07, "loss": -0.0228, "reward": 0.484084477648139, "reward_std": 0.7823295146226883, "rewards/cosine_scaled_reward": -0.01837443746626377, "rewards/format_reward": 0.5208333358168602, "step": 302 }, { "completion_length": 1376.9167175292969, "epoch": 0.3462857142857143, "grad_norm": 0.30551737546920776, "kl": 0.42236328125, "learning_rate": 4.6259454195101267e-07, "loss": -0.0461, "reward": 0.9217020869255066, "reward_std": 0.7940811067819595, "rewards/cosine_scaled_reward": 0.07543436251580715, "rewards/format_reward": 0.7708333432674408, "step": 303 }, { "completion_length": 1413.708396911621, "epoch": 0.3474285714285714, "grad_norm": 0.9130037426948547, "kl": 0.74609375, "learning_rate": 4.59514935484316e-07, "loss": -0.0368, "reward": 0.7251628190279007, "reward_std": 1.0211279392242432, "rewards/cosine_scaled_reward": 0.05008140648715198, "rewards/format_reward": 0.6250000074505806, "step": 304 }, { "completion_length": 1933.5208740234375, "epoch": 0.3485714285714286, "grad_norm": 0.6181937456130981, "kl": 0.59716796875, "learning_rate": 4.5643973913200837e-07, "loss": -0.0665, "reward": 0.6453933482989669, "reward_std": 0.8129071295261383, "rewards/cosine_scaled_reward": 0.03103000298142433, "rewards/format_reward": 0.5833333432674408, "step": 305 }, { "completion_length": 1331.7917098999023, "epoch": 0.3497142857142857, "grad_norm": 0.2622654139995575, "kl": 0.6375732421875, "learning_rate": 4.5336910277482155e-07, "loss": -0.0564, "reward": 0.4545041471719742, "reward_std": 0.6556018441915512, "rewards/cosine_scaled_reward": -0.08524793572723866, "rewards/format_reward": 0.6250000149011612, "step": 306 }, { "completion_length": 1522.2916870117188, "epoch": 0.35085714285714287, "grad_norm": 0.3843940198421478, "kl": 0.647705078125, "learning_rate": 4.503031760712397e-07, "loss": -0.0408, "reward": 0.9578620158135891, "reward_std": 0.9549144953489304, "rewards/cosine_scaled_reward": 0.15601433627307415, "rewards/format_reward": 0.645833358168602, "step": 307 }, { "completion_length": 2036.0834045410156, "epoch": 0.352, "grad_norm": 0.8481309413909912, "kl": 0.606689453125, "learning_rate": 4.4724210845020494e-07, "loss": -0.0199, "reward": 0.631169930100441, "reward_std": 0.7533179372549057, "rewards/cosine_scaled_reward": -0.028165025636553764, "rewards/format_reward": 0.6875000149011612, "step": 308 }, { "completion_length": 1487.1666870117188, "epoch": 0.35314285714285715, "grad_norm": 1.9852585792541504, "kl": 0.5830078125, "learning_rate": 4.441860491038345e-07, "loss": 0.0105, "reward": 0.7891280353069305, "reward_std": 0.8583121746778488, "rewards/cosine_scaled_reward": 0.07164734601974487, "rewards/format_reward": 0.645833358168602, "step": 309 }, { "completion_length": 1955.791748046875, "epoch": 0.35428571428571426, "grad_norm": 0.31575194001197815, "kl": 0.184326171875, "learning_rate": 4.4113514698014953e-07, "loss": -0.0014, "reward": 0.8256345121189952, "reward_std": 0.7062153369188309, "rewards/cosine_scaled_reward": 0.048233918845653534, "rewards/format_reward": 0.7291666716337204, "step": 310 }, { "completion_length": 1666.0833740234375, "epoch": 0.3554285714285714, "grad_norm": 2.016129970550537, "kl": 0.47119140625, "learning_rate": 4.3808955077581546e-07, "loss": 0.072, "reward": 0.8503673672676086, "reward_std": 0.8861262649297714, "rewards/cosine_scaled_reward": 0.08143368689343333, "rewards/format_reward": 0.6875000298023224, "step": 311 }, { "completion_length": 1778.1041870117188, "epoch": 0.3565714285714286, "grad_norm": 2.5336270332336426, "kl": 0.513916015625, "learning_rate": 4.350494089288943e-07, "loss": 0.0693, "reward": 0.5695639494806528, "reward_std": 0.7498121336102486, "rewards/cosine_scaled_reward": -0.038134701550006866, "rewards/format_reward": 0.645833358168602, "step": 312 }, { "completion_length": 2122.4791870117188, "epoch": 0.3577142857142857, "grad_norm": 0.3355765640735626, "kl": 0.609619140625, "learning_rate": 4.3201486961161093e-07, "loss": -0.0237, "reward": 0.7382938861846924, "reward_std": 0.8554851859807968, "rewards/cosine_scaled_reward": -0.005853069946169853, "rewards/format_reward": 0.7500000298023224, "step": 313 }, { "completion_length": 2387.2083435058594, "epoch": 0.3588571428571429, "grad_norm": 3.036442756652832, "kl": 0.231201171875, "learning_rate": 4.2898608072313045e-07, "loss": 0.1037, "reward": 0.8101449112291448, "reward_std": 0.963694229722023, "rewards/cosine_scaled_reward": 0.01965576596558094, "rewards/format_reward": 0.770833358168602, "step": 314 }, { "completion_length": 2170.729217529297, "epoch": 0.36, "grad_norm": 1.4392133951187134, "kl": 0.21209716796875, "learning_rate": 4.2596318988235037e-07, "loss": 0.045, "reward": 0.6554913818836212, "reward_std": 1.1266003251075745, "rewards/cosine_scaled_reward": -0.01600432489067316, "rewards/format_reward": 0.6875000149011612, "step": 315 }, { "completion_length": 2317.1459350585938, "epoch": 0.36114285714285715, "grad_norm": 0.4884386658668518, "kl": 0.36376953125, "learning_rate": 4.2294634442070553e-07, "loss": 0.0282, "reward": 0.29845087230205536, "reward_std": 0.6840033531188965, "rewards/cosine_scaled_reward": -0.15285790944471955, "rewards/format_reward": 0.6041666865348816, "step": 316 }, { "completion_length": 3088.2709350585938, "epoch": 0.36228571428571427, "grad_norm": 0.8027182817459106, "kl": 0.3505859375, "learning_rate": 4.1993569137498776e-07, "loss": 0.0242, "reward": 0.9088336080312729, "reward_std": 1.000715285539627, "rewards/cosine_scaled_reward": 0.1002501342445612, "rewards/format_reward": 0.7083333432674408, "step": 317 }, { "completion_length": 2317.3750610351562, "epoch": 0.36342857142857143, "grad_norm": 0.327318012714386, "kl": 0.3134765625, "learning_rate": 4.1693137748017915e-07, "loss": 0.0385, "reward": 0.6265020594000816, "reward_std": 0.7293453440070152, "rewards/cosine_scaled_reward": -0.040915639605373144, "rewards/format_reward": 0.7083333432674408, "step": 318 }, { "completion_length": 2849.3333740234375, "epoch": 0.36457142857142855, "grad_norm": 1.7290736436843872, "kl": 0.443359375, "learning_rate": 4.1393354916230005e-07, "loss": 0.098, "reward": 0.46177836135029793, "reward_std": 0.9352491050958633, "rewards/cosine_scaled_reward": -0.07119414396584034, "rewards/format_reward": 0.604166679084301, "step": 319 }, { "completion_length": 2402.8750610351562, "epoch": 0.3657142857142857, "grad_norm": 1.1702836751937866, "kl": 0.34814453125, "learning_rate": 4.1094235253127374e-07, "loss": 0.0587, "reward": 0.5764410048723221, "reward_std": 0.7314303368330002, "rewards/cosine_scaled_reward": -0.055529496632516384, "rewards/format_reward": 0.6875000149011612, "step": 320 }, { "completion_length": 2828.791748046875, "epoch": 0.3668571428571429, "grad_norm": 0.797664999961853, "kl": 0.52001953125, "learning_rate": 4.079579333738039e-07, "loss": 0.0412, "reward": 0.4816475547850132, "reward_std": 0.8193319886922836, "rewards/cosine_scaled_reward": -0.050842900411225855, "rewards/format_reward": 0.5833333432674408, "step": 321 }, { "completion_length": 2521.479248046875, "epoch": 0.368, "grad_norm": 1.1600196361541748, "kl": 0.3974609375, "learning_rate": 4.0498043714627006e-07, "loss": 0.024, "reward": 0.8539287596940994, "reward_std": 0.9238015562295914, "rewards/cosine_scaled_reward": 0.020714368554763496, "rewards/format_reward": 0.8125000149011612, "step": 322 }, { "completion_length": 2526.354278564453, "epoch": 0.36914285714285716, "grad_norm": 0.7439947128295898, "kl": 0.40966796875, "learning_rate": 4.020100089676376e-07, "loss": 0.0387, "reward": 0.9395965822041035, "reward_std": 0.7121690958738327, "rewards/cosine_scaled_reward": 0.0947982706129551, "rewards/format_reward": 0.7500000298023224, "step": 323 }, { "completion_length": 2963.6666870117188, "epoch": 0.3702857142857143, "grad_norm": 0.7919374108314514, "kl": 0.53271484375, "learning_rate": 3.9904679361238526e-07, "loss": 0.0574, "reward": 0.3954196572303772, "reward_std": 0.7907533347606659, "rewards/cosine_scaled_reward": -0.0939568355679512, "rewards/format_reward": 0.583333358168602, "step": 324 }, { "completion_length": 2059.416748046875, "epoch": 0.37142857142857144, "grad_norm": 0.7337906956672668, "kl": 0.30908203125, "learning_rate": 3.9609093550344907e-07, "loss": 0.0437, "reward": 0.6482492443174124, "reward_std": 0.976516529917717, "rewards/cosine_scaled_reward": -0.050875378074124455, "rewards/format_reward": 0.7500000149011612, "step": 325 }, { "completion_length": 2717.5001220703125, "epoch": 0.37257142857142855, "grad_norm": 0.7754512429237366, "kl": 0.4609375, "learning_rate": 3.931425787051832e-07, "loss": 0.0804, "reward": 0.5230683460831642, "reward_std": 0.7168317809700966, "rewards/cosine_scaled_reward": -0.09263250115327537, "rewards/format_reward": 0.708333358168602, "step": 326 }, { "completion_length": 2365.666732788086, "epoch": 0.3737142857142857, "grad_norm": 0.9611565470695496, "kl": 0.370513916015625, "learning_rate": 3.902018669163384e-07, "loss": 0.013, "reward": 0.8529483936727047, "reward_std": 0.787610650062561, "rewards/cosine_scaled_reward": 0.05147417262196541, "rewards/format_reward": 0.7500000149011612, "step": 327 }, { "completion_length": 2920.5000610351562, "epoch": 0.37485714285714283, "grad_norm": 1.1496500968933105, "kl": 0.568359375, "learning_rate": 3.872689434630585e-07, "loss": 0.1313, "reward": 0.5756548047065735, "reward_std": 1.1168714761734009, "rewards/cosine_scaled_reward": -0.04550594184547663, "rewards/format_reward": 0.6666666865348816, "step": 328 }, { "completion_length": 2694.229248046875, "epoch": 0.376, "grad_norm": 1.6449869871139526, "kl": 0.4189453125, "learning_rate": 3.843439512918949e-07, "loss": 0.0905, "reward": 0.607914388179779, "reward_std": 0.9643268138170242, "rewards/cosine_scaled_reward": -0.0918761616339907, "rewards/format_reward": 0.7916666865348816, "step": 329 }, { "completion_length": 2766.041748046875, "epoch": 0.37714285714285717, "grad_norm": 0.8693978190422058, "kl": 0.56396484375, "learning_rate": 3.8142703296283953e-07, "loss": 0.0817, "reward": 0.42995208874344826, "reward_std": 0.9052233844995499, "rewards/cosine_scaled_reward": -0.07669062539935112, "rewards/format_reward": 0.5833333432674408, "step": 330 }, { "completion_length": 2704.2084350585938, "epoch": 0.3782857142857143, "grad_norm": 0.6593329906463623, "kl": 0.43994140625, "learning_rate": 3.785183306423767e-07, "loss": 0.0481, "reward": 0.5416111797094345, "reward_std": 0.7576990574598312, "rewards/cosine_scaled_reward": -0.07294442504644394, "rewards/format_reward": 0.6875000149011612, "step": 331 }, { "completion_length": 2430.1458740234375, "epoch": 0.37942857142857145, "grad_norm": 1.1451934576034546, "kl": 0.4638671875, "learning_rate": 3.7561798609655373e-07, "loss": 0.0949, "reward": 0.9672386646270752, "reward_std": 0.9684969633817673, "rewards/cosine_scaled_reward": 0.13986931554973125, "rewards/format_reward": 0.6875000298023224, "step": 332 }, { "completion_length": 2586.3959045410156, "epoch": 0.38057142857142856, "grad_norm": 1.2027528285980225, "kl": 0.5546875, "learning_rate": 3.72726140684072e-07, "loss": 0.0376, "reward": 0.24384124111384153, "reward_std": 0.6339670419692993, "rewards/cosine_scaled_reward": -0.2218293957412243, "rewards/format_reward": 0.6875000149011612, "step": 333 }, { "completion_length": 2716.5208740234375, "epoch": 0.38171428571428573, "grad_norm": 0.5679751634597778, "kl": 0.46875, "learning_rate": 3.6984293534939737e-07, "loss": 0.0595, "reward": 0.49158087372779846, "reward_std": 0.6254527196288109, "rewards/cosine_scaled_reward": -0.07712622173130512, "rewards/format_reward": 0.645833358168602, "step": 334 }, { "completion_length": 2559.4583740234375, "epoch": 0.38285714285714284, "grad_norm": 0.4788146913051605, "kl": 0.447509765625, "learning_rate": 3.6696851061588994e-07, "loss": 0.0589, "reward": 0.47583791986107826, "reward_std": 0.6539599671959877, "rewards/cosine_scaled_reward": -0.08499772474169731, "rewards/format_reward": 0.6458333432674408, "step": 335 }, { "completion_length": 2945.4584350585938, "epoch": 0.384, "grad_norm": 0.6187959313392639, "kl": 0.59814453125, "learning_rate": 3.641030065789562e-07, "loss": 0.1016, "reward": 0.08771202201023698, "reward_std": 0.7820224016904831, "rewards/cosine_scaled_reward": -0.23739399760961533, "rewards/format_reward": 0.5625000298023224, "step": 336 }, { "completion_length": 2430.8958740234375, "epoch": 0.3851428571428571, "grad_norm": 0.7578234672546387, "kl": 0.46826171875, "learning_rate": 3.612465628992203e-07, "loss": 0.0748, "reward": 0.5553858801722527, "reward_std": 0.7994070649147034, "rewards/cosine_scaled_reward": -0.06605706363916397, "rewards/format_reward": 0.6875000149011612, "step": 337 }, { "completion_length": 2227.916717529297, "epoch": 0.3862857142857143, "grad_norm": 0.8869759440422058, "kl": 0.354248046875, "learning_rate": 3.5839931879571725e-07, "loss": 0.0831, "reward": 0.7496502324938774, "reward_std": 0.8079821169376373, "rewards/cosine_scaled_reward": -0.0001748921349644661, "rewards/format_reward": 0.7500000223517418, "step": 338 }, { "completion_length": 2985.3334350585938, "epoch": 0.38742857142857146, "grad_norm": 1.4707542657852173, "kl": 0.666015625, "learning_rate": 3.555614130391079e-07, "loss": 0.1233, "reward": 0.36759741231799126, "reward_std": 0.8881158977746964, "rewards/cosine_scaled_reward": -0.06620129197835922, "rewards/format_reward": 0.5000000149011612, "step": 339 }, { "completion_length": 2439.7501220703125, "epoch": 0.38857142857142857, "grad_norm": 2.691328287124634, "kl": 0.453125, "learning_rate": 3.5273298394491515e-07, "loss": -0.0493, "reward": 1.0150221139192581, "reward_std": 0.9879051297903061, "rewards/cosine_scaled_reward": 0.11167772859334946, "rewards/format_reward": 0.7916666865348816, "step": 340 }, { "completion_length": 2257.937530517578, "epoch": 0.38971428571428574, "grad_norm": 0.7236793637275696, "kl": 0.3848876953125, "learning_rate": 3.4991416936678276e-07, "loss": 0.0485, "reward": 1.5515939444303513, "reward_std": 0.958163395524025, "rewards/cosine_scaled_reward": 0.35913030058145523, "rewards/format_reward": 0.8333333432674408, "step": 341 }, { "completion_length": 2541.0834045410156, "epoch": 0.39085714285714285, "grad_norm": 0.982089102268219, "kl": 0.48095703125, "learning_rate": 3.471051066897562e-07, "loss": 0.0531, "reward": 0.5335123301483691, "reward_std": 0.8991846293210983, "rewards/cosine_scaled_reward": -0.09782716228437494, "rewards/format_reward": 0.7291666865348816, "step": 342 }, { "completion_length": 2201.8125915527344, "epoch": 0.392, "grad_norm": 3.367811918258667, "kl": 0.84130859375, "learning_rate": 3.4430593282358777e-07, "loss": 0.0659, "reward": 1.025502122938633, "reward_std": 0.8074321299791336, "rewards/cosine_scaled_reward": 0.11691772192716599, "rewards/format_reward": 0.7916666865348816, "step": 343 }, { "completion_length": 2793.7501220703125, "epoch": 0.3931428571428571, "grad_norm": 0.6109259724617004, "kl": 0.50537109375, "learning_rate": 3.4151678419606233e-07, "loss": 0.0712, "reward": 0.599671695381403, "reward_std": 0.8611319363117218, "rewards/cosine_scaled_reward": -0.04391413927078247, "rewards/format_reward": 0.6875000149011612, "step": 344 }, { "completion_length": 2314.166778564453, "epoch": 0.3942857142857143, "grad_norm": 0.6686170697212219, "kl": 0.5712890625, "learning_rate": 3.387377967463493e-07, "loss": 0.0395, "reward": 0.634972408413887, "reward_std": 0.6707823574542999, "rewards/cosine_scaled_reward": -0.05751381441950798, "rewards/format_reward": 0.7500000149011612, "step": 345 }, { "completion_length": 2902.9584350585938, "epoch": 0.3954285714285714, "grad_norm": 0.6067929863929749, "kl": 0.57958984375, "learning_rate": 3.359691059183761e-07, "loss": 0.1087, "reward": 0.4132253248244524, "reward_std": 0.8897982537746429, "rewards/cosine_scaled_reward": -0.05380401201546192, "rewards/format_reward": 0.5208333507180214, "step": 346 }, { "completion_length": 1998.2083740234375, "epoch": 0.3965714285714286, "grad_norm": 0.9779978394508362, "kl": 0.2724609375, "learning_rate": 3.3321084665422803e-07, "loss": 0.0409, "reward": 1.1842745244503021, "reward_std": 1.0255057215690613, "rewards/cosine_scaled_reward": 0.17547059804201126, "rewards/format_reward": 0.8333333432674408, "step": 347 }, { "completion_length": 2504.416748046875, "epoch": 0.3977142857142857, "grad_norm": 0.7763749957084656, "kl": 0.45556640625, "learning_rate": 3.3046315338757026e-07, "loss": 0.0586, "reward": 0.6250789314508438, "reward_std": 0.745910570025444, "rewards/cosine_scaled_reward": -0.020793883129954338, "rewards/format_reward": 0.6666666865348816, "step": 348 }, { "completion_length": 1897.9375915527344, "epoch": 0.39885714285714285, "grad_norm": 0.759898841381073, "kl": 0.2515869140625, "learning_rate": 3.2772616003709616e-07, "loss": -0.0009, "reward": 1.2401193976402283, "reward_std": 0.7767119854688644, "rewards/cosine_scaled_reward": 0.18255970953032374, "rewards/format_reward": 0.8750000149011612, "step": 349 }, { "completion_length": 1845.9792175292969, "epoch": 0.4, "grad_norm": 0.5678505301475525, "kl": 0.2552642822265625, "learning_rate": 3.250000000000001e-07, "loss": 0.0331, "reward": 1.1045997142791748, "reward_std": 0.6993750482797623, "rewards/cosine_scaled_reward": 0.13563317246735096, "rewards/format_reward": 0.8333333432674408, "step": 350 }, { "completion_length": 2162.1250610351562, "epoch": 0.40114285714285713, "grad_norm": 0.8248549699783325, "kl": 0.34246826171875, "learning_rate": 3.222848061454764e-07, "loss": 0.0701, "reward": 0.6730905398726463, "reward_std": 1.0314117968082428, "rewards/cosine_scaled_reward": -0.03845473984256387, "rewards/format_reward": 0.75, "step": 351 }, { "completion_length": 2398.8750915527344, "epoch": 0.4022857142857143, "grad_norm": 0.7086507678031921, "kl": 0.337158203125, "learning_rate": 3.195807108082429e-07, "loss": 0.0512, "reward": 0.5578571353107691, "reward_std": 0.8292429894208908, "rewards/cosine_scaled_reward": -0.10648808628320694, "rewards/format_reward": 0.770833358168602, "step": 352 }, { "completion_length": 2411.541778564453, "epoch": 0.4034285714285714, "grad_norm": 0.43448832631111145, "kl": 0.3551025390625, "learning_rate": 3.168878457820915e-07, "loss": 0.032, "reward": 0.7701159529387951, "reward_std": 0.8441641330718994, "rewards/cosine_scaled_reward": 0.010057959705591202, "rewards/format_reward": 0.75, "step": 353 }, { "completion_length": 2516.8750915527344, "epoch": 0.4045714285714286, "grad_norm": 0.47943782806396484, "kl": 0.382568359375, "learning_rate": 3.142063423134644e-07, "loss": 0.0606, "reward": 0.435189101845026, "reward_std": 0.6631861850619316, "rewards/cosine_scaled_reward": -0.13657212257385254, "rewards/format_reward": 0.708333358168602, "step": 354 }, { "completion_length": 1538.3125610351562, "epoch": 0.4057142857142857, "grad_norm": 0.3774828314781189, "kl": 0.3017578125, "learning_rate": 3.115363310950578e-07, "loss": 0.0368, "reward": 0.8316129595041275, "reward_std": 0.5808935090899467, "rewards/cosine_scaled_reward": -0.021693539805710316, "rewards/format_reward": 0.8750000149011612, "step": 355 }, { "completion_length": 2109.8333435058594, "epoch": 0.40685714285714286, "grad_norm": 0.3181619346141815, "kl": 0.30126953125, "learning_rate": 3.0887794225945143e-07, "loss": 0.0337, "reward": 0.5732035748660564, "reward_std": 0.6602266579866409, "rewards/cosine_scaled_reward": -0.057148221880197525, "rewards/format_reward": 0.6875000298023224, "step": 356 }, { "completion_length": 2442.1458740234375, "epoch": 0.408, "grad_norm": 0.8465009927749634, "kl": 0.5537109375, "learning_rate": 3.062313053727671e-07, "loss": 0.0438, "reward": 0.5404957421123981, "reward_std": 0.6692793369293213, "rewards/cosine_scaled_reward": -0.08391880989074707, "rewards/format_reward": 0.708333358168602, "step": 357 }, { "completion_length": 2172.5001220703125, "epoch": 0.40914285714285714, "grad_norm": 0.5915915966033936, "kl": 0.2880859375, "learning_rate": 3.0359654942835247e-07, "loss": 0.04, "reward": 0.9776165038347244, "reward_std": 0.8002345710992813, "rewards/cosine_scaled_reward": 0.07214158028364182, "rewards/format_reward": 0.833333358168602, "step": 358 }, { "completion_length": 1994.7709350585938, "epoch": 0.4102857142857143, "grad_norm": 0.5695796608924866, "kl": 0.33642578125, "learning_rate": 3.0097380284049523e-07, "loss": 0.0421, "reward": 0.5635941876098514, "reward_std": 0.682354062795639, "rewards/cosine_scaled_reward": -0.08278624271042645, "rewards/format_reward": 0.7291666716337204, "step": 359 }, { "completion_length": 1582.5625305175781, "epoch": 0.4114285714285714, "grad_norm": 0.6911218166351318, "kl": 0.187103271484375, "learning_rate": 2.9836319343816397e-07, "loss": 0.038, "reward": 0.9810230135917664, "reward_std": 0.6732440888881683, "rewards/cosine_scaled_reward": 0.03217813931405544, "rewards/format_reward": 0.9166666865348816, "step": 360 }, { "completion_length": 1716.8541870117188, "epoch": 0.4125714285714286, "grad_norm": 0.755465567111969, "kl": 0.2716064453125, "learning_rate": 2.9576484845877793e-07, "loss": -0.0037, "reward": 0.4921398665755987, "reward_std": 0.7469517663121223, "rewards/cosine_scaled_reward": -0.10809672623872757, "rewards/format_reward": 0.7083333432674408, "step": 361 }, { "completion_length": 2381.7708435058594, "epoch": 0.4137142857142857, "grad_norm": 0.4649311900138855, "kl": 0.435546875, "learning_rate": 2.931788945420058e-07, "loss": 0.0655, "reward": 0.3485546410083771, "reward_std": 0.8100304752588272, "rewards/cosine_scaled_reward": -0.13822269346565008, "rewards/format_reward": 0.6250000149011612, "step": 362 }, { "completion_length": 2278.6876220703125, "epoch": 0.41485714285714287, "grad_norm": 0.38487836718559265, "kl": 0.3544921875, "learning_rate": 2.9060545772359305e-07, "loss": 0.0483, "reward": 0.6228149347007275, "reward_std": 0.7660052478313446, "rewards/cosine_scaled_reward": -0.05317586287856102, "rewards/format_reward": 0.7291666865348816, "step": 363 }, { "completion_length": 1783.0834045410156, "epoch": 0.416, "grad_norm": 0.6700667142868042, "kl": 0.27978515625, "learning_rate": 2.8804466342921987e-07, "loss": 0.006, "reward": 0.5264641232788563, "reward_std": 0.7023270279169083, "rewards/cosine_scaled_reward": -0.12218462734017521, "rewards/format_reward": 0.7708333432674408, "step": 364 }, { "completion_length": 1910.2500305175781, "epoch": 0.41714285714285715, "grad_norm": 0.7392496466636658, "kl": 0.290771484375, "learning_rate": 2.854966364683872e-07, "loss": 0.0333, "reward": 0.8516478016972542, "reward_std": 0.938531182706356, "rewards/cosine_scaled_reward": 0.009157223626971245, "rewards/format_reward": 0.8333333432674408, "step": 365 }, { "completion_length": 2063.8958740234375, "epoch": 0.41828571428571426, "grad_norm": 1.9315472841262817, "kl": 0.2879638671875, "learning_rate": 2.829615010283344e-07, "loss": 0.068, "reward": 0.9369229730218649, "reward_std": 0.8918980956077576, "rewards/cosine_scaled_reward": 0.09346149861812592, "rewards/format_reward": 0.7500000298023224, "step": 366 }, { "completion_length": 1400.2917175292969, "epoch": 0.41942857142857143, "grad_norm": 0.2165093868970871, "kl": 0.1763916015625, "learning_rate": 2.8043938066798645e-07, "loss": 0.0193, "reward": 0.9957811124622822, "reward_std": 0.45480820536613464, "rewards/cosine_scaled_reward": 0.04997388273477554, "rewards/format_reward": 0.8958333432674408, "step": 367 }, { "completion_length": 1647.7916717529297, "epoch": 0.4205714285714286, "grad_norm": 0.7413077354431152, "kl": 0.174774169921875, "learning_rate": 2.7793039831193133e-07, "loss": -0.0034, "reward": 0.8528083562850952, "reward_std": 0.8265992403030396, "rewards/cosine_scaled_reward": 0.009737495332956314, "rewards/format_reward": 0.833333358168602, "step": 368 }, { "completion_length": 1487.3958435058594, "epoch": 0.4217142857142857, "grad_norm": 0.6509503722190857, "kl": 0.12530517578125, "learning_rate": 2.7543467624442956e-07, "loss": -0.0257, "reward": 0.9031364023685455, "reward_std": 0.9219841361045837, "rewards/cosine_scaled_reward": 0.03490149416029453, "rewards/format_reward": 0.833333358168602, "step": 369 }, { "completion_length": 2323.229248046875, "epoch": 0.4228571428571429, "grad_norm": 1.1870368719100952, "kl": 0.2625732421875, "learning_rate": 2.729523361034538e-07, "loss": -0.0417, "reward": 0.7300510033965111, "reward_std": 0.8341569006443024, "rewards/cosine_scaled_reward": -0.051641182973980904, "rewards/format_reward": 0.833333358168602, "step": 370 }, { "completion_length": 1885.0834045410156, "epoch": 0.424, "grad_norm": 0.3413795232772827, "kl": 0.232666015625, "learning_rate": 2.7048349887476037e-07, "loss": 0.0358, "reward": 0.33694031462073326, "reward_std": 0.7036072686314583, "rewards/cosine_scaled_reward": -0.21694651246070862, "rewards/format_reward": 0.7708333432674408, "step": 371 }, { "completion_length": 2071.812530517578, "epoch": 0.42514285714285716, "grad_norm": 0.9272376894950867, "kl": 0.242919921875, "learning_rate": 2.6802828488599294e-07, "loss": -0.0016, "reward": 0.9880311861634254, "reward_std": 0.629561685025692, "rewards/cosine_scaled_reward": 0.025265559554100037, "rewards/format_reward": 0.9375000149011612, "step": 372 }, { "completion_length": 2372.0834045410156, "epoch": 0.42628571428571427, "grad_norm": 0.8849138617515564, "kl": 0.249755859375, "learning_rate": 2.655868138008171e-07, "loss": 0.0062, "reward": 0.7052676677703857, "reward_std": 0.6477234065532684, "rewards/cosine_scaled_reward": -0.07444952987134457, "rewards/format_reward": 0.8541667014360428, "step": 373 }, { "completion_length": 2331.8125610351562, "epoch": 0.42742857142857144, "grad_norm": 0.5580031275749207, "kl": 0.309814453125, "learning_rate": 2.631592046130896e-07, "loss": 0.0456, "reward": 0.6995935346931219, "reward_std": 0.7008600682020187, "rewards/cosine_scaled_reward": 0.00604674918577075, "rewards/format_reward": 0.6875000223517418, "step": 374 }, { "completion_length": 1906.2083740234375, "epoch": 0.42857142857142855, "grad_norm": 0.5966392755508423, "kl": 0.3814697265625, "learning_rate": 2.6074557564105724e-07, "loss": 0.0463, "reward": 0.7689145356416702, "reward_std": 0.7337282001972198, "rewards/cosine_scaled_reward": -0.02179272472858429, "rewards/format_reward": 0.8125000298023224, "step": 375 }, { "completion_length": 1971.3125915527344, "epoch": 0.4297142857142857, "grad_norm": 1.3154016733169556, "kl": 0.175048828125, "learning_rate": 2.583460445215911e-07, "loss": 0.0574, "reward": 0.968916192650795, "reward_std": 0.9032018631696701, "rewards/cosine_scaled_reward": 0.0677914135158062, "rewards/format_reward": 0.8333333432674408, "step": 376 }, { "completion_length": 2224.666748046875, "epoch": 0.4308571428571429, "grad_norm": 0.892139196395874, "kl": 0.1807861328125, "learning_rate": 2.5596072820445254e-07, "loss": 0.0031, "reward": 1.009105697274208, "reward_std": 0.9417294263839722, "rewards/cosine_scaled_reward": 0.09830283187329769, "rewards/format_reward": 0.8125000298023224, "step": 377 }, { "completion_length": 2115.2709045410156, "epoch": 0.432, "grad_norm": 0.9765793085098267, "kl": 0.26611328125, "learning_rate": 2.5358974294659373e-07, "loss": 0.0684, "reward": 0.5737282857298851, "reward_std": 0.6101915389299393, "rewards/cosine_scaled_reward": -0.12980252876877785, "rewards/format_reward": 0.8333333432674408, "step": 378 }, { "completion_length": 1658.2916870117188, "epoch": 0.43314285714285716, "grad_norm": 0.20954985916614532, "kl": 0.232666015625, "learning_rate": 2.512332043064913e-07, "loss": 0.0026, "reward": 0.6455265134572983, "reward_std": 0.5983955562114716, "rewards/cosine_scaled_reward": -0.08348675072193146, "rewards/format_reward": 0.8125000298023224, "step": 379 }, { "completion_length": 2212.4375610351562, "epoch": 0.4342857142857143, "grad_norm": 1.3722639083862305, "kl": 0.3023681640625, "learning_rate": 2.488912271385139e-07, "loss": 0.0993, "reward": 0.5304721817374229, "reward_std": 0.7781679779291153, "rewards/cosine_scaled_reward": -0.10976393148303032, "rewards/format_reward": 0.7500000149011612, "step": 380 }, { "completion_length": 1918.8750305175781, "epoch": 0.43542857142857144, "grad_norm": 0.7221528887748718, "kl": 0.295806884765625, "learning_rate": 2.465639255873246e-07, "loss": 0.0029, "reward": 0.9501378051936626, "reward_std": 0.6066517308354378, "rewards/cosine_scaled_reward": 0.047985561192035675, "rewards/format_reward": 0.8541666865348816, "step": 381 }, { "completion_length": 1793.6459045410156, "epoch": 0.43657142857142855, "grad_norm": 0.25511884689331055, "kl": 0.2493896484375, "learning_rate": 2.4425141308231765e-07, "loss": 0.0226, "reward": 0.9860572461038828, "reward_std": 0.6644920855760574, "rewards/cosine_scaled_reward": 0.08677859604358673, "rewards/format_reward": 0.8125000298023224, "step": 382 }, { "completion_length": 1990.2500915527344, "epoch": 0.4377142857142857, "grad_norm": 0.4499902129173279, "kl": 0.25341796875, "learning_rate": 2.4195380233209006e-07, "loss": 0.053, "reward": 0.7591063939034939, "reward_std": 0.5849988833069801, "rewards/cosine_scaled_reward": -0.04753013700246811, "rewards/format_reward": 0.8541667014360428, "step": 383 }, { "completion_length": 1889.8750610351562, "epoch": 0.43885714285714283, "grad_norm": 0.34465470910072327, "kl": 0.23828125, "learning_rate": 2.3967120531894857e-07, "loss": 0.0019, "reward": 0.626850601285696, "reward_std": 0.5293265283107758, "rewards/cosine_scaled_reward": -0.12407470063772053, "rewards/format_reward": 0.8750000149011612, "step": 384 }, { "completion_length": 1649.5208587646484, "epoch": 0.44, "grad_norm": 1.0988309383392334, "kl": 0.2174072265625, "learning_rate": 2.374037332934512e-07, "loss": 0.046, "reward": 0.8215210735797882, "reward_std": 0.7156432569026947, "rewards/cosine_scaled_reward": -0.005906133679673076, "rewards/format_reward": 0.833333358168602, "step": 385 }, { "completion_length": 1869.666748046875, "epoch": 0.44114285714285717, "grad_norm": 0.31057262420654297, "kl": 0.213623046875, "learning_rate": 2.3515149676898552e-07, "loss": 0.0312, "reward": 0.7544382140040398, "reward_std": 0.5287479311227798, "rewards/cosine_scaled_reward": -0.09153091069310904, "rewards/format_reward": 0.9375000149011612, "step": 386 }, { "completion_length": 2174.000030517578, "epoch": 0.4422857142857143, "grad_norm": 0.7334949374198914, "kl": 0.2723388671875, "learning_rate": 2.3291460551638237e-07, "loss": 0.0412, "reward": 0.7021404728293419, "reward_std": 0.8102448135614395, "rewards/cosine_scaled_reward": -0.07601310685276985, "rewards/format_reward": 0.8541666865348816, "step": 387 }, { "completion_length": 1764.0625305175781, "epoch": 0.44342857142857145, "grad_norm": 0.8506814241409302, "kl": 0.211181640625, "learning_rate": 2.306931685585657e-07, "loss": 0.0326, "reward": 0.9473480954766273, "reward_std": 0.7040945738554001, "rewards/cosine_scaled_reward": 0.025757367722690105, "rewards/format_reward": 0.895833358168602, "step": 388 }, { "completion_length": 1558.6875915527344, "epoch": 0.44457142857142856, "grad_norm": 1.0051478147506714, "kl": 0.10626220703125, "learning_rate": 2.2848729416523859e-07, "loss": 0.0233, "reward": 1.2105353027582169, "reward_std": 0.7370782792568207, "rewards/cosine_scaled_reward": 0.14693431742489338, "rewards/format_reward": 0.9166666865348816, "step": 389 }, { "completion_length": 1673.0417175292969, "epoch": 0.44571428571428573, "grad_norm": 1.0045956373214722, "kl": 0.324462890625, "learning_rate": 2.2629708984760706e-07, "loss": -0.0122, "reward": 0.682011567056179, "reward_std": 0.668542355298996, "rewards/cosine_scaled_reward": -0.08607756206765771, "rewards/format_reward": 0.8541666865348816, "step": 390 }, { "completion_length": 1759.604248046875, "epoch": 0.44685714285714284, "grad_norm": 0.8641379475593567, "kl": 0.30419921875, "learning_rate": 2.2412266235313973e-07, "loss": -0.0151, "reward": 0.40198634564876556, "reward_std": 0.4891185835003853, "rewards/cosine_scaled_reward": -0.23650683648884296, "rewards/format_reward": 0.8750000149011612, "step": 391 }, { "completion_length": 1997.0208740234375, "epoch": 0.448, "grad_norm": 0.601497232913971, "kl": 0.3251953125, "learning_rate": 2.2196411766036487e-07, "loss": 0.0246, "reward": 1.31626558303833, "reward_std": 0.8470017611980438, "rewards/cosine_scaled_reward": 0.2206327999010682, "rewards/format_reward": 0.8750000149011612, "step": 392 }, { "completion_length": 1767.3958740234375, "epoch": 0.4491428571428571, "grad_norm": 0.9790117740631104, "kl": 0.20623779296875, "learning_rate": 2.1982156097370557e-07, "loss": 0.0716, "reward": 1.0628649685531855, "reward_std": 0.7842252627015114, "rewards/cosine_scaled_reward": 0.09393247216939926, "rewards/format_reward": 0.8750000149011612, "step": 393 }, { "completion_length": 2281.5625610351562, "epoch": 0.4502857142857143, "grad_norm": 0.9092360138893127, "kl": 0.2666015625, "learning_rate": 2.1769509671835223e-07, "loss": 0.0071, "reward": 0.7091562300920486, "reward_std": 0.6370756179094315, "rewards/cosine_scaled_reward": -0.09333855286240578, "rewards/format_reward": 0.8958333432674408, "step": 394 }, { "completion_length": 2072.7083740234375, "epoch": 0.4514285714285714, "grad_norm": 0.6948179006576538, "kl": 0.335205078125, "learning_rate": 2.1558482853517253e-07, "loss": 0.0399, "reward": 0.6186719592660666, "reward_std": 0.8180225193500519, "rewards/cosine_scaled_reward": -0.06566403433680534, "rewards/format_reward": 0.7500000298023224, "step": 395 }, { "completion_length": 1713.0625305175781, "epoch": 0.45257142857142857, "grad_norm": 1.03392493724823, "kl": 0.2850341796875, "learning_rate": 2.134908592756607e-07, "loss": 0.0576, "reward": 0.6681124269962311, "reward_std": 0.72493577003479, "rewards/cosine_scaled_reward": -0.07219376973807812, "rewards/format_reward": 0.8125000149011612, "step": 396 }, { "completion_length": 2008.166748046875, "epoch": 0.45371428571428574, "grad_norm": 1.2174099683761597, "kl": 0.3359375, "learning_rate": 2.1141329099692406e-07, "loss": 0.0821, "reward": 1.3461299315094948, "reward_std": 0.8196755945682526, "rewards/cosine_scaled_reward": 0.2668149508535862, "rewards/format_reward": 0.8125000298023224, "step": 397 }, { "completion_length": 1758.4167175292969, "epoch": 0.45485714285714285, "grad_norm": 0.7967256307601929, "kl": 0.3011474609375, "learning_rate": 2.0935222495670968e-07, "loss": 0.0175, "reward": 1.0533079504966736, "reward_std": 0.9479693919420242, "rewards/cosine_scaled_reward": 0.057903981767594814, "rewards/format_reward": 0.9375000149011612, "step": 398 }, { "completion_length": 2110.0000610351562, "epoch": 0.456, "grad_norm": 0.6236258149147034, "kl": 0.3653564453125, "learning_rate": 2.0730776160846853e-07, "loss": 0.0552, "reward": 0.8325799964368343, "reward_std": 0.6572683453559875, "rewards/cosine_scaled_reward": -0.00037669437006115913, "rewards/format_reward": 0.8333333432674408, "step": 399 }, { "completion_length": 1693.8333587646484, "epoch": 0.45714285714285713, "grad_norm": 0.5594977736473083, "kl": 0.239166259765625, "learning_rate": 2.0528000059645995e-07, "loss": 0.0092, "reward": 0.5645224675536156, "reward_std": 0.47261467576026917, "rewards/cosine_scaled_reward": -0.18648880254477262, "rewards/format_reward": 0.9375000149011612, "step": 400 }, { "completion_length": 2298.4375610351562, "epoch": 0.4582857142857143, "grad_norm": 0.46592381596565247, "kl": 0.5498046875, "learning_rate": 2.032690407508949e-07, "loss": 0.0651, "reward": 0.7146447077393532, "reward_std": 0.9194528758525848, "rewards/cosine_scaled_reward": -0.05934431403875351, "rewards/format_reward": 0.833333358168602, "step": 401 }, { "completion_length": 2858.3959350585938, "epoch": 0.4594285714285714, "grad_norm": 1.3920950889587402, "kl": 0.701171875, "learning_rate": 2.0127498008311922e-07, "loss": 0.0681, "reward": 0.38334885984659195, "reward_std": 0.6373907253146172, "rewards/cosine_scaled_reward": -0.20415889844298363, "rewards/format_reward": 0.7916666865348816, "step": 402 }, { "completion_length": 2565.5626220703125, "epoch": 0.4605714285714286, "grad_norm": 1.1024017333984375, "kl": 0.625, "learning_rate": 1.9929791578083655e-07, "loss": 0.029, "reward": 0.801287055015564, "reward_std": 0.897977739572525, "rewards/cosine_scaled_reward": -0.03685649996623397, "rewards/format_reward": 0.8750000298023224, "step": 403 }, { "completion_length": 2505.916748046875, "epoch": 0.4617142857142857, "grad_norm": 1.409442663192749, "kl": 0.65576171875, "learning_rate": 1.9733794420337213e-07, "loss": 0.0304, "reward": 1.2360095381736755, "reward_std": 0.7143290638923645, "rewards/cosine_scaled_reward": 0.18050476163625717, "rewards/format_reward": 0.8750000149011612, "step": 404 }, { "completion_length": 2441.0000610351562, "epoch": 0.46285714285714286, "grad_norm": 0.8860685229301453, "kl": 0.64306640625, "learning_rate": 1.9539516087697517e-07, "loss": 0.0652, "reward": 1.0503446012735367, "reward_std": 0.8782050907611847, "rewards/cosine_scaled_reward": 0.10850561456754804, "rewards/format_reward": 0.833333358168602, "step": 405 }, { "completion_length": 2316.562530517578, "epoch": 0.464, "grad_norm": 0.9385198354721069, "kl": 0.6611328125, "learning_rate": 1.934696604901642e-07, "loss": 0.039, "reward": 0.8388771619647741, "reward_std": 0.5718994289636612, "rewards/cosine_scaled_reward": -0.007644776254892349, "rewards/format_reward": 0.8541666865348816, "step": 406 }, { "completion_length": 2314.6459045410156, "epoch": 0.46514285714285714, "grad_norm": 1.216766357421875, "kl": 0.55029296875, "learning_rate": 1.915615368891117e-07, "loss": 0.0239, "reward": 0.8419212326407433, "reward_std": 0.65188068151474, "rewards/cosine_scaled_reward": -0.037372760474681854, "rewards/format_reward": 0.9166666865348816, "step": 407 }, { "completion_length": 2388.791717529297, "epoch": 0.4662857142857143, "grad_norm": 0.6723232865333557, "kl": 0.4609375, "learning_rate": 1.8967088307307e-07, "loss": 0.048, "reward": 1.100903958082199, "reward_std": 0.7514118552207947, "rewards/cosine_scaled_reward": 0.10253530507907271, "rewards/format_reward": 0.8958333432674408, "step": 408 }, { "completion_length": 2204.958465576172, "epoch": 0.4674285714285714, "grad_norm": 0.9829697012901306, "kl": 0.53759765625, "learning_rate": 1.8779779118983867e-07, "loss": 0.0237, "reward": 1.0897281467914581, "reward_std": 0.4026891812682152, "rewards/cosine_scaled_reward": 0.10736404359340668, "rewards/format_reward": 0.8750000149011612, "step": 409 }, { "completion_length": 2508.729248046875, "epoch": 0.4685714285714286, "grad_norm": 1.1136001348495483, "kl": 0.58837890625, "learning_rate": 1.8594235253127372e-07, "loss": 0.0388, "reward": 0.4441644148901105, "reward_std": 0.8706175982952118, "rewards/cosine_scaled_reward": -0.1425011307001114, "rewards/format_reward": 0.7291666865348816, "step": 410 }, { "completion_length": 2758.3543090820312, "epoch": 0.4697142857142857, "grad_norm": 1.1172066926956177, "kl": 0.52685546875, "learning_rate": 1.8410465752883758e-07, "loss": 0.0214, "reward": 0.38340113312005997, "reward_std": 0.6312393695116043, "rewards/cosine_scaled_reward": -0.235382791608572, "rewards/format_reward": 0.8541667014360428, "step": 411 }, { "completion_length": 2535.416717529297, "epoch": 0.47085714285714286, "grad_norm": 1.1818182468414307, "kl": 0.579833984375, "learning_rate": 1.822847957491922e-07, "loss": 0.028, "reward": 0.8752952516078949, "reward_std": 0.5417208820581436, "rewards/cosine_scaled_reward": 0.02098093181848526, "rewards/format_reward": 0.8333333432674408, "step": 412 }, { "completion_length": 2696.1250610351562, "epoch": 0.472, "grad_norm": 0.5541598796844482, "kl": 0.5654296875, "learning_rate": 1.804828558898332e-07, "loss": 0.0783, "reward": 0.497568441554904, "reward_std": 0.7255310416221619, "rewards/cosine_scaled_reward": -0.10538244433701038, "rewards/format_reward": 0.708333358168602, "step": 413 }, { "completion_length": 2673.8751220703125, "epoch": 0.47314285714285714, "grad_norm": 0.9568617343902588, "kl": 0.53369140625, "learning_rate": 1.7869892577476722e-07, "loss": 0.0999, "reward": 0.9199014604091644, "reward_std": 0.8385901600122452, "rewards/cosine_scaled_reward": 0.053700722055509686, "rewards/format_reward": 0.8125000298023224, "step": 414 }, { "completion_length": 2954.5418090820312, "epoch": 0.4742857142857143, "grad_norm": 1.3337595462799072, "kl": 0.607421875, "learning_rate": 1.7693309235023127e-07, "loss": 0.0298, "reward": 0.843063585460186, "reward_std": 0.9124226570129395, "rewards/cosine_scaled_reward": 0.0048651136457920074, "rewards/format_reward": 0.833333358168602, "step": 415 }, { "completion_length": 2910.4375610351562, "epoch": 0.4754285714285714, "grad_norm": 0.6592503786087036, "kl": 0.61865234375, "learning_rate": 1.7518544168045524e-07, "loss": 0.0733, "reward": 0.46045139618217945, "reward_std": 0.8773138746619225, "rewards/cosine_scaled_reward": -0.10310766100883484, "rewards/format_reward": 0.6666666865348816, "step": 416 }, { "completion_length": 2641.2918090820312, "epoch": 0.4765714285714286, "grad_norm": 0.829136073589325, "kl": 0.49462890625, "learning_rate": 1.7345605894346726e-07, "loss": 0.057, "reward": 0.9838578663766384, "reward_std": 0.7910896837711334, "rewards/cosine_scaled_reward": 0.054428933188319206, "rewards/format_reward": 0.8750000298023224, "step": 417 }, { "completion_length": 2964.3541870117188, "epoch": 0.4777142857142857, "grad_norm": 0.9262496829032898, "kl": 0.5478515625, "learning_rate": 1.7174502842694212e-07, "loss": 0.0357, "reward": 0.6633618324995041, "reward_std": 0.6466763466596603, "rewards/cosine_scaled_reward": -0.10581910982728004, "rewards/format_reward": 0.8750000149011612, "step": 418 }, { "completion_length": 2773.5626220703125, "epoch": 0.47885714285714287, "grad_norm": 0.8558900952339172, "kl": 0.49072265625, "learning_rate": 1.7005243352409333e-07, "loss": 0.0542, "reward": 0.6305762082338333, "reward_std": 0.7357209548354149, "rewards/cosine_scaled_reward": -0.080545240547508, "rewards/format_reward": 0.7916666716337204, "step": 419 }, { "completion_length": 2345.8959045410156, "epoch": 0.48, "grad_norm": 0.6529119610786438, "kl": 0.3431396484375, "learning_rate": 1.6837835672960831e-07, "loss": 0.0226, "reward": 1.2573866918683052, "reward_std": 0.9116456806659698, "rewards/cosine_scaled_reward": 0.1911933235824108, "rewards/format_reward": 0.8750000149011612, "step": 420 }, { "completion_length": 2318.1875915527344, "epoch": 0.48114285714285715, "grad_norm": 0.6412160396575928, "kl": 0.35498046875, "learning_rate": 1.6672287963562852e-07, "loss": 0.0124, "reward": 1.0443747788667679, "reward_std": 0.7097911983728409, "rewards/cosine_scaled_reward": 0.09510404244065285, "rewards/format_reward": 0.8541666865348816, "step": 421 }, { "completion_length": 2448.5833740234375, "epoch": 0.48228571428571426, "grad_norm": 0.6165621280670166, "kl": 0.421875, "learning_rate": 1.6508608292777203e-07, "loss": 0.0385, "reward": 0.7055833786725998, "reward_std": 0.7713779509067535, "rewards/cosine_scaled_reward": -0.053458321839571, "rewards/format_reward": 0.8125000298023224, "step": 422 }, { "completion_length": 2370.479278564453, "epoch": 0.48342857142857143, "grad_norm": 1.0260326862335205, "kl": 0.325927734375, "learning_rate": 1.6346804638120098e-07, "loss": 0.0657, "reward": 0.8030254691839218, "reward_std": 0.8349241316318512, "rewards/cosine_scaled_reward": -0.015153962187469006, "rewards/format_reward": 0.8333333432674408, "step": 423 }, { "completion_length": 2863.0833740234375, "epoch": 0.4845714285714286, "grad_norm": 0.8439249396324158, "kl": 0.43115234375, "learning_rate": 1.6186884885673413e-07, "loss": 0.0713, "reward": 0.4908841624855995, "reward_std": 0.8119627386331558, "rewards/cosine_scaled_reward": -0.15039126574993134, "rewards/format_reward": 0.7916666865348816, "step": 424 }, { "completion_length": 2920.604248046875, "epoch": 0.4857142857142857, "grad_norm": 0.7168906927108765, "kl": 0.455078125, "learning_rate": 1.6028856829700258e-07, "loss": 0.0577, "reward": 0.8773088157176971, "reward_std": 0.8730379045009613, "rewards/cosine_scaled_reward": 0.032404396682977676, "rewards/format_reward": 0.8125000149011612, "step": 425 }, { "completion_length": 2877.354248046875, "epoch": 0.4868571428571429, "grad_norm": 0.7351894974708557, "kl": 0.3916015625, "learning_rate": 1.5872728172265146e-07, "loss": 0.0252, "reward": 1.0884526520967484, "reward_std": 0.8330738395452499, "rewards/cosine_scaled_reward": 0.10672629997134209, "rewards/format_reward": 0.8750000298023224, "step": 426 }, { "completion_length": 2745.041748046875, "epoch": 0.488, "grad_norm": 0.4892515242099762, "kl": 0.33447265625, "learning_rate": 1.5718506522858572e-07, "loss": 0.0358, "reward": 1.0718627832829952, "reward_std": 0.7832525819540024, "rewards/cosine_scaled_reward": 0.11926471255719662, "rewards/format_reward": 0.833333358168602, "step": 427 }, { "completion_length": 2932.1458740234375, "epoch": 0.48914285714285716, "grad_norm": 1.292845606803894, "kl": 0.52685546875, "learning_rate": 1.5566199398026147e-07, "loss": 0.0264, "reward": 0.31675857678055763, "reward_std": 0.5401652418076992, "rewards/cosine_scaled_reward": -0.21662072464823723, "rewards/format_reward": 0.7500000074505806, "step": 428 }, { "completion_length": 2592.3334045410156, "epoch": 0.49028571428571427, "grad_norm": 0.6887741088867188, "kl": 0.39111328125, "learning_rate": 1.5415814221002265e-07, "loss": 0.0134, "reward": 0.8551270663738251, "reward_std": 0.883497804403305, "rewards/cosine_scaled_reward": -0.02035313844680786, "rewards/format_reward": 0.8958333432674408, "step": 429 }, { "completion_length": 2997.7709350585938, "epoch": 0.49142857142857144, "grad_norm": 0.9550595283508301, "kl": 0.4248046875, "learning_rate": 1.5267358321348285e-07, "loss": 0.0977, "reward": 0.7853763314778917, "reward_std": 0.862298920750618, "rewards/cosine_scaled_reward": -0.013561863452196121, "rewards/format_reward": 0.8125000298023224, "step": 430 }, { "completion_length": 2814.6459350585938, "epoch": 0.49257142857142855, "grad_norm": 0.35693833231925964, "kl": 0.42578125, "learning_rate": 1.5120838934595337e-07, "loss": 0.0465, "reward": 0.7550955265760422, "reward_std": 0.797643780708313, "rewards/cosine_scaled_reward": -0.01828559674322605, "rewards/format_reward": 0.7916666865348816, "step": 431 }, { "completion_length": 2680.0833740234375, "epoch": 0.4937142857142857, "grad_norm": 0.3660014867782593, "kl": 0.42138671875, "learning_rate": 1.4976263201891613e-07, "loss": 0.0453, "reward": 0.5384078100323677, "reward_std": 0.6302113831043243, "rewards/cosine_scaled_reward": -0.11621277220547199, "rewards/format_reward": 0.7708333432674408, "step": 432 }, { "completion_length": 3197.0625610351562, "epoch": 0.4948571428571429, "grad_norm": 0.834852397441864, "kl": 0.45703125, "learning_rate": 1.483363816965435e-07, "loss": 0.0415, "reward": 0.677655503153801, "reward_std": 0.997919499874115, "rewards/cosine_scaled_reward": -0.015338926576077938, "rewards/format_reward": 0.7083333432674408, "step": 433 }, { "completion_length": 2018.541748046875, "epoch": 0.496, "grad_norm": 0.3951985836029053, "kl": 0.17779541015625, "learning_rate": 1.469297078922642e-07, "loss": -0.0128, "reward": 1.5104268491268158, "reward_std": 0.6382196992635727, "rewards/cosine_scaled_reward": 0.2760467454791069, "rewards/format_reward": 0.9583333432674408, "step": 434 }, { "completion_length": 2781.45849609375, "epoch": 0.49714285714285716, "grad_norm": 0.8080605268478394, "kl": 0.41552734375, "learning_rate": 1.4554267916537495e-07, "loss": 0.072, "reward": 0.5199687406420708, "reward_std": 0.697292298078537, "rewards/cosine_scaled_reward": -0.11501563712954521, "rewards/format_reward": 0.7500000298023224, "step": 435 }, { "completion_length": 2910.9168090820312, "epoch": 0.4982857142857143, "grad_norm": 1.0082898139953613, "kl": 0.31591796875, "learning_rate": 1.4417536311769885e-07, "loss": 0.0441, "reward": 1.0633302181959152, "reward_std": 0.8466629385948181, "rewards/cosine_scaled_reward": 0.06291508674621582, "rewards/format_reward": 0.9375000149011612, "step": 436 }, { "completion_length": 2581.2500610351562, "epoch": 0.49942857142857144, "grad_norm": 0.5378354787826538, "kl": 0.2705078125, "learning_rate": 1.4282782639029128e-07, "loss": 0.003, "reward": 1.0827649384737015, "reward_std": 0.822308674454689, "rewards/cosine_scaled_reward": 0.10388245154172182, "rewards/format_reward": 0.8750000149011612, "step": 437 }, { "completion_length": 2723.1251220703125, "epoch": 0.5005714285714286, "grad_norm": 0.6586508750915527, "kl": 0.340087890625, "learning_rate": 1.4150013466019114e-07, "loss": 0.046, "reward": 1.0363626778125763, "reward_std": 0.9988095015287399, "rewards/cosine_scaled_reward": 0.10151464305818081, "rewards/format_reward": 0.8333333432674408, "step": 438 }, { "completion_length": 2458.041778564453, "epoch": 0.5017142857142857, "grad_norm": 0.6118423342704773, "kl": 0.3319091796875, "learning_rate": 1.4019235263722034e-07, "loss": 0.059, "reward": 0.7599635235965252, "reward_std": 0.6979039385914803, "rewards/cosine_scaled_reward": -0.05751825252082199, "rewards/format_reward": 0.8750000149011612, "step": 439 }, { "completion_length": 2812.7916870117188, "epoch": 0.5028571428571429, "grad_norm": 0.6263717412948608, "kl": 0.346435546875, "learning_rate": 1.3890454406082956e-07, "loss": 0.0402, "reward": 0.7473399192094803, "reward_std": 0.7950000017881393, "rewards/cosine_scaled_reward": -0.04299671063199639, "rewards/format_reward": 0.833333358168602, "step": 440 }, { "completion_length": 2658.854217529297, "epoch": 0.504, "grad_norm": 0.48751676082611084, "kl": 0.3270263671875, "learning_rate": 1.3763677169699217e-07, "loss": 0.0419, "reward": 0.7070795819163322, "reward_std": 0.773023784160614, "rewards/cosine_scaled_reward": -0.04229356348514557, "rewards/format_reward": 0.7916666716337204, "step": 441 }, { "completion_length": 2351.6250610351562, "epoch": 0.5051428571428571, "grad_norm": 0.5668932199478149, "kl": 0.252685546875, "learning_rate": 1.3638909733514452e-07, "loss": 0.0475, "reward": 0.8659966886043549, "reward_std": 0.5813730582594872, "rewards/cosine_scaled_reward": 0.016331655904650688, "rewards/format_reward": 0.833333358168602, "step": 442 }, { "completion_length": 2956.729248046875, "epoch": 0.5062857142857143, "grad_norm": 0.3870391249656677, "kl": 0.30859375, "learning_rate": 1.351615817851748e-07, "loss": 0.0416, "reward": 1.151278093457222, "reward_std": 0.8103004992008209, "rewards/cosine_scaled_reward": 0.11730570159852505, "rewards/format_reward": 0.9166666865348816, "step": 443 }, { "completion_length": 2814.6876220703125, "epoch": 0.5074285714285715, "grad_norm": 0.5548789501190186, "kl": 0.369140625, "learning_rate": 1.3395428487445914e-07, "loss": 0.0343, "reward": 0.9690770208835602, "reward_std": 0.9044716209173203, "rewards/cosine_scaled_reward": 0.09912180341780186, "rewards/format_reward": 0.770833358168602, "step": 444 }, { "completion_length": 2858.6875610351562, "epoch": 0.5085714285714286, "grad_norm": 0.7488447427749634, "kl": 0.3701171875, "learning_rate": 1.3276726544494571e-07, "loss": 0.0152, "reward": 0.6591560812667012, "reward_std": 0.6855928599834442, "rewards/cosine_scaled_reward": -0.035005307756364346, "rewards/format_reward": 0.7291666865348816, "step": 445 }, { "completion_length": 2472.7500610351562, "epoch": 0.5097142857142857, "grad_norm": 0.5907102227210999, "kl": 0.208251953125, "learning_rate": 1.316005813502869e-07, "loss": 0.0325, "reward": 1.3291829228401184, "reward_std": 0.7747218981385231, "rewards/cosine_scaled_reward": 0.206258125603199, "rewards/format_reward": 0.9166666716337204, "step": 446 }, { "completion_length": 2428.1459350585938, "epoch": 0.5108571428571429, "grad_norm": 0.5603023171424866, "kl": 0.2802734375, "learning_rate": 1.3045428945301953e-07, "loss": 0.0368, "reward": 0.9525867849588394, "reward_std": 0.712784081697464, "rewards/cosine_scaled_reward": 0.038793399930000305, "rewards/format_reward": 0.875, "step": 447 }, { "completion_length": 2589.3958740234375, "epoch": 0.512, "grad_norm": 0.9914929866790771, "kl": 0.297607421875, "learning_rate": 1.2932844562179352e-07, "loss": 0.0567, "reward": 1.3133542239665985, "reward_std": 1.0432665199041367, "rewards/cosine_scaled_reward": 0.27126041799783707, "rewards/format_reward": 0.770833358168602, "step": 448 }, { "completion_length": 2799.166748046875, "epoch": 0.5131428571428571, "grad_norm": 1.0846092700958252, "kl": 0.4091796875, "learning_rate": 1.2822310472864885e-07, "loss": 0.0606, "reward": 0.9647302851080894, "reward_std": 0.7462186589837074, "rewards/cosine_scaled_reward": 0.10736512392759323, "rewards/format_reward": 0.7500000298023224, "step": 449 }, { "completion_length": 2719.916748046875, "epoch": 0.5142857142857142, "grad_norm": 0.5918545126914978, "kl": 0.3916015625, "learning_rate": 1.2713832064634125e-07, "loss": 0.014, "reward": 1.0999898612499237, "reward_std": 0.8317281156778336, "rewards/cosine_scaled_reward": 0.15416158083826303, "rewards/format_reward": 0.7916666865348816, "step": 450 }, { "completion_length": 2748.4584350585938, "epoch": 0.5154285714285715, "grad_norm": 1.2674349546432495, "kl": 0.348876953125, "learning_rate": 1.260741462457165e-07, "loss": 0.0753, "reward": 0.851899653673172, "reward_std": 0.9279103875160217, "rewards/cosine_scaled_reward": 0.019699793308973312, "rewards/format_reward": 0.8125000298023224, "step": 451 }, { "completion_length": 2946.291748046875, "epoch": 0.5165714285714286, "grad_norm": 0.9848341941833496, "kl": 0.4384765625, "learning_rate": 1.2503063339313356e-07, "loss": 0.0244, "reward": 0.7191433683037758, "reward_std": 0.8444506227970123, "rewards/cosine_scaled_reward": -0.0154283307492733, "rewards/format_reward": 0.7500000149011612, "step": 452 }, { "completion_length": 2824.5000610351562, "epoch": 0.5177142857142857, "grad_norm": 1.562027931213379, "kl": 0.450439453125, "learning_rate": 1.2400783294793668e-07, "loss": 0.0678, "reward": 0.9857252687215805, "reward_std": 0.8770118951797485, "rewards/cosine_scaled_reward": 0.1074459683150053, "rewards/format_reward": 0.7708333432674408, "step": 453 }, { "completion_length": 2845.291748046875, "epoch": 0.5188571428571429, "grad_norm": 1.0593106746673584, "kl": 0.399658203125, "learning_rate": 1.2300579475997657e-07, "loss": 0.0191, "reward": 0.5798447616398335, "reward_std": 0.7729413360357285, "rewards/cosine_scaled_reward": -0.11632763035595417, "rewards/format_reward": 0.8125, "step": 454 }, { "completion_length": 2406.979248046875, "epoch": 0.52, "grad_norm": 0.4025033712387085, "kl": 0.32861328125, "learning_rate": 1.220245676671809e-07, "loss": 0.0397, "reward": 1.0016262233257294, "reward_std": 0.6507641598582268, "rewards/cosine_scaled_reward": 0.104979757219553, "rewards/format_reward": 0.7916666865348816, "step": 455 }, { "completion_length": 2493.3125915527344, "epoch": 0.5211428571428571, "grad_norm": 0.6641373038291931, "kl": 0.3935546875, "learning_rate": 1.2106419949317388e-07, "loss": 0.0496, "reward": 0.8123725727200508, "reward_std": 0.6888710185885429, "rewards/cosine_scaled_reward": 0.04160293936729431, "rewards/format_reward": 0.7291666865348816, "step": 456 }, { "completion_length": 2579.354248046875, "epoch": 0.5222857142857142, "grad_norm": 0.3551529347896576, "kl": 0.35302734375, "learning_rate": 1.2012473704494537e-07, "loss": 0.0275, "reward": 0.670621931552887, "reward_std": 0.6615720614790916, "rewards/cosine_scaled_reward": -0.06052236817777157, "rewards/format_reward": 0.7916666865348816, "step": 457 }, { "completion_length": 2468.854217529297, "epoch": 0.5234285714285715, "grad_norm": 0.5066484212875366, "kl": 0.423828125, "learning_rate": 1.1920622611056974e-07, "loss": 0.0466, "reward": 1.0128154456615448, "reward_std": 0.9961100518703461, "rewards/cosine_scaled_reward": 0.08974102255888283, "rewards/format_reward": 0.8333333432674408, "step": 458 }, { "completion_length": 2763.354248046875, "epoch": 0.5245714285714286, "grad_norm": 0.7024835348129272, "kl": 0.363037109375, "learning_rate": 1.1830871145697412e-07, "loss": 0.0672, "reward": 0.604728564620018, "reward_std": 0.7839554250240326, "rewards/cosine_scaled_reward": -0.11430239118635654, "rewards/format_reward": 0.833333358168602, "step": 459 }, { "completion_length": 2871.4584350585938, "epoch": 0.5257142857142857, "grad_norm": 0.6273028254508972, "kl": 0.372314453125, "learning_rate": 1.1743223682775649e-07, "loss": 0.0527, "reward": 1.0004199892282486, "reward_std": 0.8981437683105469, "rewards/cosine_scaled_reward": 0.1147933267056942, "rewards/format_reward": 0.770833358168602, "step": 460 }, { "completion_length": 2718.2709350585938, "epoch": 0.5268571428571428, "grad_norm": 0.46946173906326294, "kl": 0.447021484375, "learning_rate": 1.1657684494105386e-07, "loss": 0.0404, "reward": 1.022796869277954, "reward_std": 0.7989484220743179, "rewards/cosine_scaled_reward": 0.12598175182938576, "rewards/format_reward": 0.7708333432674408, "step": 461 }, { "completion_length": 2926.0416870117188, "epoch": 0.528, "grad_norm": 1.261118769645691, "kl": 0.525390625, "learning_rate": 1.1574257748745986e-07, "loss": 0.0831, "reward": 0.7424125671386719, "reward_std": 0.9555595070123672, "rewards/cosine_scaled_reward": -0.0037937182933092117, "rewards/format_reward": 0.7500000149011612, "step": 462 }, { "completion_length": 2262.4376220703125, "epoch": 0.5291428571428571, "grad_norm": 0.5456348657608032, "kl": 0.3070068359375, "learning_rate": 1.1492947512799328e-07, "loss": 0.0543, "reward": 1.0686239376664162, "reward_std": 0.6754159927368164, "rewards/cosine_scaled_reward": 0.1488952711224556, "rewards/format_reward": 0.7708333432674408, "step": 463 }, { "completion_length": 3016.8958740234375, "epoch": 0.5302857142857142, "grad_norm": 1.5390175580978394, "kl": 0.45947265625, "learning_rate": 1.1413757749211602e-07, "loss": 0.1119, "reward": 0.8216940313577652, "reward_std": 1.1384240239858627, "rewards/cosine_scaled_reward": 0.03584700915962458, "rewards/format_reward": 0.7500000149011612, "step": 464 }, { "completion_length": 2775.0208740234375, "epoch": 0.5314285714285715, "grad_norm": 1.5516222715377808, "kl": 0.47607421875, "learning_rate": 1.1336692317580158e-07, "loss": 0.0147, "reward": 0.7128820940852165, "reward_std": 0.8897013664245605, "rewards/cosine_scaled_reward": -0.018558980314992368, "rewards/format_reward": 0.7500000298023224, "step": 465 }, { "completion_length": 2875.3333740234375, "epoch": 0.5325714285714286, "grad_norm": 0.6315276622772217, "kl": 0.55029296875, "learning_rate": 1.1261754973965422e-07, "loss": 0.0399, "reward": 0.6401756927371025, "reward_std": 0.7611015811562538, "rewards/cosine_scaled_reward": -0.054912167601287365, "rewards/format_reward": 0.7500000149011612, "step": 466 }, { "completion_length": 2514.8750610351562, "epoch": 0.5337142857142857, "grad_norm": 0.43570035696029663, "kl": 0.39990234375, "learning_rate": 1.1188949370707787e-07, "loss": 0.0301, "reward": 0.6949951946735382, "reward_std": 0.7680038511753082, "rewards/cosine_scaled_reward": -0.06916908174753189, "rewards/format_reward": 0.8333333730697632, "step": 467 }, { "completion_length": 2586.1458740234375, "epoch": 0.5348571428571428, "grad_norm": 0.6298258304595947, "kl": 0.396484375, "learning_rate": 1.1118279056249653e-07, "loss": 0.0409, "reward": 1.2849786281585693, "reward_std": 0.9066727161407471, "rewards/cosine_scaled_reward": 0.2570726328995079, "rewards/format_reward": 0.770833358168602, "step": 468 }, { "completion_length": 2279.604248046875, "epoch": 0.536, "grad_norm": 0.42815151810646057, "kl": 0.2633056640625, "learning_rate": 1.1049747474962444e-07, "loss": 0.015, "reward": 0.784978911280632, "reward_std": 0.6496678665280342, "rewards/cosine_scaled_reward": -0.04501055763103068, "rewards/format_reward": 0.8750000149011612, "step": 469 }, { "completion_length": 2119.416748046875, "epoch": 0.5371428571428571, "grad_norm": 1.2341870069503784, "kl": 0.427001953125, "learning_rate": 1.0983357966978745e-07, "loss": 0.054, "reward": 0.6538757495582104, "reward_std": 0.8121753484010696, "rewards/cosine_scaled_reward": -0.037645455449819565, "rewards/format_reward": 0.7291666865348816, "step": 470 }, { "completion_length": 2927.5834350585938, "epoch": 0.5382857142857143, "grad_norm": 0.632990300655365, "kl": 0.61474609375, "learning_rate": 1.0919113768029517e-07, "loss": 0.0463, "reward": 0.5753965899348259, "reward_std": 0.8329771310091019, "rewards/cosine_scaled_reward": -0.1081350538879633, "rewards/format_reward": 0.7916666865348816, "step": 471 }, { "completion_length": 2382.7500610351562, "epoch": 0.5394285714285715, "grad_norm": 0.4871074855327606, "kl": 0.42333984375, "learning_rate": 1.0857018009286381e-07, "loss": 0.0518, "reward": 1.0515232384204865, "reward_std": 0.8982365727424622, "rewards/cosine_scaled_reward": 0.16117826476693153, "rewards/format_reward": 0.7291666865348816, "step": 472 }, { "completion_length": 2928.9584350585938, "epoch": 0.5405714285714286, "grad_norm": 1.3636996746063232, "kl": 0.498779296875, "learning_rate": 1.0797073717209013e-07, "loss": 0.03, "reward": 0.32807744294404984, "reward_std": 0.5049104988574982, "rewards/cosine_scaled_reward": -0.21096128597855568, "rewards/format_reward": 0.7500000149011612, "step": 473 }, { "completion_length": 2577.0625610351562, "epoch": 0.5417142857142857, "grad_norm": 1.3398447036743164, "kl": 0.351806640625, "learning_rate": 1.0739283813397639e-07, "loss": 0.0931, "reward": 0.9431183338165283, "reward_std": 0.893795982003212, "rewards/cosine_scaled_reward": 0.08614248159574345, "rewards/format_reward": 0.7708333432674408, "step": 474 }, { "completion_length": 2737.3751220703125, "epoch": 0.5428571428571428, "grad_norm": 1.3732081651687622, "kl": 0.3955078125, "learning_rate": 1.068365111445064e-07, "loss": 0.0893, "reward": 0.8586708009243011, "reward_std": 0.8809327185153961, "rewards/cosine_scaled_reward": 0.06475206837058067, "rewards/format_reward": 0.7291666865348816, "step": 475 }, { "completion_length": 2780.3125610351562, "epoch": 0.544, "grad_norm": 1.55986750125885, "kl": 0.4127197265625, "learning_rate": 1.063017833182728e-07, "loss": 0.0047, "reward": 0.8244488090276718, "reward_std": 0.7860056459903717, "rewards/cosine_scaled_reward": 0.05805772356688976, "rewards/format_reward": 0.7083333432674408, "step": 476 }, { "completion_length": 2252.229248046875, "epoch": 0.5451428571428572, "grad_norm": 0.784569263458252, "kl": 0.378082275390625, "learning_rate": 1.0578868071715544e-07, "loss": 0.0354, "reward": 1.200981080532074, "reward_std": 0.7509779334068298, "rewards/cosine_scaled_reward": 0.1734071932733059, "rewards/format_reward": 0.8541666865348816, "step": 477 }, { "completion_length": 2425.9583740234375, "epoch": 0.5462857142857143, "grad_norm": 0.4835829436779022, "kl": 0.4466552734375, "learning_rate": 1.0529722834905125e-07, "loss": 0.0496, "reward": 0.7616169229149818, "reward_std": 0.6851886659860611, "rewards/cosine_scaled_reward": -0.035858187824487686, "rewards/format_reward": 0.8333333432674408, "step": 478 }, { "completion_length": 2255.3750610351562, "epoch": 0.5474285714285714, "grad_norm": 0.9519103765487671, "kl": 0.386962890625, "learning_rate": 1.0482745016665526e-07, "loss": 0.0216, "reward": 0.9349322374910116, "reward_std": 0.613688588142395, "rewards/cosine_scaled_reward": 0.08204942103475332, "rewards/format_reward": 0.770833358168602, "step": 479 }, { "completion_length": 2591.8959350585938, "epoch": 0.5485714285714286, "grad_norm": 0.619563102722168, "kl": 0.52685546875, "learning_rate": 1.0437936906629334e-07, "loss": 0.042, "reward": 0.7943236902356148, "reward_std": 1.037893146276474, "rewards/cosine_scaled_reward": 0.06382851302623749, "rewards/format_reward": 0.6666666865348816, "step": 480 }, { "completion_length": 2677.4791870117188, "epoch": 0.5497142857142857, "grad_norm": 0.45002222061157227, "kl": 0.56689453125, "learning_rate": 1.0395300688680625e-07, "loss": 0.0405, "reward": 0.5671083256602287, "reward_std": 0.708008423447609, "rewards/cosine_scaled_reward": -0.10186250880360603, "rewards/format_reward": 0.7708333432674408, "step": 481 }, { "completion_length": 2174.2291870117188, "epoch": 0.5508571428571428, "grad_norm": 0.3016662299633026, "kl": 0.340087890625, "learning_rate": 1.0354838440848501e-07, "loss": 0.0176, "reward": 0.9943665787577629, "reward_std": 0.5935569703578949, "rewards/cosine_scaled_reward": 0.04926658235490322, "rewards/format_reward": 0.8958333432674408, "step": 482 }, { "completion_length": 2582.6459350585938, "epoch": 0.552, "grad_norm": 0.7917870879173279, "kl": 0.466064453125, "learning_rate": 1.0316552135205837e-07, "loss": 0.0279, "reward": 0.6264216639101505, "reward_std": 0.9700927287340164, "rewards/cosine_scaled_reward": -0.04095582733862102, "rewards/format_reward": 0.708333358168602, "step": 483 }, { "completion_length": 2702.2708740234375, "epoch": 0.5531428571428572, "grad_norm": 0.5935311317443848, "kl": 0.388427734375, "learning_rate": 1.0280443637773163e-07, "loss": 0.0319, "reward": 0.9062394499778748, "reward_std": 0.7218269556760788, "rewards/cosine_scaled_reward": 0.06770304590463638, "rewards/format_reward": 0.770833358168602, "step": 484 }, { "completion_length": 2429.1458435058594, "epoch": 0.5542857142857143, "grad_norm": 0.7909466028213501, "kl": 0.4248046875, "learning_rate": 1.0246514708427701e-07, "loss": 0.0119, "reward": 0.7019704282283783, "reward_std": 0.6897935420274734, "rewards/cosine_scaled_reward": -0.0448481235653162, "rewards/format_reward": 0.7916666865348816, "step": 485 }, { "completion_length": 2677.291748046875, "epoch": 0.5554285714285714, "grad_norm": 1.1475855112075806, "kl": 0.327880859375, "learning_rate": 1.0214767000817596e-07, "loss": 0.0051, "reward": 1.1072902642190456, "reward_std": 0.7692115753889084, "rewards/cosine_scaled_reward": 0.14739511162042618, "rewards/format_reward": 0.8125000149011612, "step": 486 }, { "completion_length": 2795.8750610351562, "epoch": 0.5565714285714286, "grad_norm": 0.5653597116470337, "kl": 0.3798828125, "learning_rate": 1.0185202062281336e-07, "loss": 0.0451, "reward": 0.7866236716508865, "reward_std": 0.6821945160627365, "rewards/cosine_scaled_reward": -0.02335483953356743, "rewards/format_reward": 0.833333358168602, "step": 487 }, { "completion_length": 2895.3751220703125, "epoch": 0.5577142857142857, "grad_norm": 0.4974069893360138, "kl": 0.4326171875, "learning_rate": 1.0157821333772304e-07, "loss": 0.062, "reward": 0.5221007950603962, "reward_std": 0.8605436235666275, "rewards/cosine_scaled_reward": -0.1035329382866621, "rewards/format_reward": 0.7291667014360428, "step": 488 }, { "completion_length": 2012.541748046875, "epoch": 0.5588571428571428, "grad_norm": 0.5164794921875, "kl": 0.254058837890625, "learning_rate": 1.013262614978859e-07, "loss": 0.0022, "reward": 1.416559837758541, "reward_std": 0.6288183927536011, "rewards/cosine_scaled_reward": 0.2707799021154642, "rewards/format_reward": 0.8750000149011612, "step": 489 }, { "completion_length": 2575.6668090820312, "epoch": 0.56, "grad_norm": 0.8971602916717529, "kl": 0.3701171875, "learning_rate": 1.0109617738307911e-07, "loss": 0.0111, "reward": 0.5933700278401375, "reward_std": 0.6079118028283119, "rewards/cosine_scaled_reward": -0.1408149916678667, "rewards/format_reward": 0.8750000149011612, "step": 490 }, { "completion_length": 2626.854217529297, "epoch": 0.5611428571428572, "grad_norm": 0.7071827054023743, "kl": 0.3095703125, "learning_rate": 1.0088797220727779e-07, "loss": 0.043, "reward": 0.9613501131534576, "reward_std": 0.8130423650145531, "rewards/cosine_scaled_reward": 0.07442504540085793, "rewards/format_reward": 0.8125000149011612, "step": 491 }, { "completion_length": 2436.7291870117188, "epoch": 0.5622857142857143, "grad_norm": 0.44464409351348877, "kl": 0.28173828125, "learning_rate": 1.0070165611810855e-07, "loss": 0.0244, "reward": 0.7668804228305817, "reward_std": 0.6314697042107582, "rewards/cosine_scaled_reward": -0.07489313930273056, "rewards/format_reward": 0.9166666865348816, "step": 492 }, { "completion_length": 2737.0209350585938, "epoch": 0.5634285714285714, "grad_norm": 0.5461977124214172, "kl": 0.404296875, "learning_rate": 1.005372381963547e-07, "loss": 0.038, "reward": 0.5373080670833588, "reward_std": 0.7348825931549072, "rewards/cosine_scaled_reward": -0.11676262941909954, "rewards/format_reward": 0.770833358168602, "step": 493 }, { "completion_length": 2169.5416870117188, "epoch": 0.5645714285714286, "grad_norm": 0.2975417971611023, "kl": 0.2210693359375, "learning_rate": 1.0039472645551372e-07, "loss": 0.0104, "reward": 0.6467055715620518, "reward_std": 0.6691789701581001, "rewards/cosine_scaled_reward": -0.10373054444789886, "rewards/format_reward": 0.8541666716337204, "step": 494 }, { "completion_length": 2759.9793090820312, "epoch": 0.5657142857142857, "grad_norm": 0.7536102533340454, "kl": 0.2822265625, "learning_rate": 1.002741278414069e-07, "loss": 0.0527, "reward": 1.0850744023919106, "reward_std": 0.9734541922807693, "rewards/cosine_scaled_reward": 0.13628720492124557, "rewards/format_reward": 0.8125000149011612, "step": 495 }, { "completion_length": 2828.6459350585938, "epoch": 0.5668571428571428, "grad_norm": 0.7388039231300354, "kl": 0.3896484375, "learning_rate": 1.0017544823184055e-07, "loss": 0.0107, "reward": 0.9930586367845535, "reward_std": 0.9435475766658783, "rewards/cosine_scaled_reward": 0.1215293172863312, "rewards/format_reward": 0.7500000298023224, "step": 496 }, { "completion_length": 2202.437530517578, "epoch": 0.568, "grad_norm": 0.4381030201911926, "kl": 0.25762939453125, "learning_rate": 1.0009869243631952e-07, "loss": 0.0376, "reward": 1.1173406671732664, "reward_std": 0.5638850405812263, "rewards/cosine_scaled_reward": 0.1836703196167946, "rewards/format_reward": 0.7500000149011612, "step": 497 }, { "completion_length": 2922.4376220703125, "epoch": 0.5691428571428572, "grad_norm": 0.3199293315410614, "kl": 0.4296875, "learning_rate": 1.000438641958131e-07, "loss": 0.0655, "reward": 0.23180836997926235, "reward_std": 0.6018998995423317, "rewards/cosine_scaled_reward": -0.23826248571276665, "rewards/format_reward": 0.708333358168602, "step": 498 }, { "completion_length": 2946.0626220703125, "epoch": 0.5702857142857143, "grad_norm": 0.9604411125183105, "kl": 0.41943359375, "learning_rate": 1.0001096618257236e-07, "loss": 0.068, "reward": 0.4334963224828243, "reward_std": 0.9516143649816513, "rewards/cosine_scaled_reward": -0.1270018396899104, "rewards/format_reward": 0.6875000149011612, "step": 499 }, { "completion_length": 2515.8958740234375, "epoch": 0.5714285714285714, "grad_norm": 1.0595104694366455, "kl": 0.287109375, "learning_rate": 1e-07, "loss": 0.0511, "reward": 0.935544490814209, "reward_std": 1.0099718570709229, "rewards/cosine_scaled_reward": 0.07193891797214746, "rewards/format_reward": 0.7916666865348816, "step": 500 }, { "epoch": 0.5714285714285714, "step": 500, "total_flos": 0.0, "train_loss": 0.008432806108146906, "train_runtime": 8817.9865, "train_samples_per_second": 2.722, "train_steps_per_second": 0.057 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }