| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.5714285714285714, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 3001.9584350585938, | |
| "epoch": 0.001142857142857143, | |
| "grad_norm": 0.11473917961120605, | |
| "kl": 0.0, | |
| "learning_rate": 2e-08, | |
| "loss": 0.034, | |
| "reward": -0.010712452232837677, | |
| "reward_std": 0.48354096710681915, | |
| "rewards/cosine_scaled_reward": -0.1928562317043543, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 2822.541717529297, | |
| "epoch": 0.002285714285714286, | |
| "grad_norm": 0.17855221033096313, | |
| "kl": 0.0, | |
| "learning_rate": 4e-08, | |
| "loss": 0.1095, | |
| "reward": 0.4385625521535985, | |
| "reward_std": 0.8208381980657578, | |
| "rewards/cosine_scaled_reward": -0.009885392151772976, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 2903.604248046875, | |
| "epoch": 0.0034285714285714284, | |
| "grad_norm": 0.05400172621011734, | |
| "kl": 3.629922866821289e-05, | |
| "learning_rate": 6e-08, | |
| "loss": 0.0166, | |
| "reward": -0.3212598990648985, | |
| "reward_std": 0.36036985367536545, | |
| "rewards/cosine_scaled_reward": -0.3168799467384815, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 2924.8958740234375, | |
| "epoch": 0.004571428571428572, | |
| "grad_norm": 0.1298418492078781, | |
| "kl": 3.390759229660034e-05, | |
| "learning_rate": 8e-08, | |
| "loss": 0.0193, | |
| "reward": 0.11002232693135738, | |
| "reward_std": 0.5668230727314949, | |
| "rewards/cosine_scaled_reward": -0.12207217514514923, | |
| "rewards/format_reward": 0.3541666865348816, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 2699.4793090820312, | |
| "epoch": 0.005714285714285714, | |
| "grad_norm": 0.11395805329084396, | |
| "kl": 2.8192996978759766e-05, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0509, | |
| "reward": 0.5249291565269232, | |
| "reward_std": 0.7597299069166183, | |
| "rewards/cosine_scaled_reward": 0.033297897316515446, | |
| "rewards/format_reward": 0.4583333544433117, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 2660.5001220703125, | |
| "epoch": 0.006857142857142857, | |
| "grad_norm": 0.15824902057647705, | |
| "kl": 4.559755325317383e-05, | |
| "learning_rate": 1.2e-07, | |
| "loss": 0.04, | |
| "reward": 0.42945386096835136, | |
| "reward_std": 0.6760371923446655, | |
| "rewards/cosine_scaled_reward": -0.05610641464591026, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 2458.479217529297, | |
| "epoch": 0.008, | |
| "grad_norm": 0.10866966843605042, | |
| "kl": 2.4110078811645508e-05, | |
| "learning_rate": 1.4e-07, | |
| "loss": 0.0529, | |
| "reward": 0.7580276802182198, | |
| "reward_std": 0.6385035738348961, | |
| "rewards/cosine_scaled_reward": 0.09776384383440018, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 2977.8126220703125, | |
| "epoch": 0.009142857142857144, | |
| "grad_norm": 0.22230574488639832, | |
| "kl": 3.574788570404053e-05, | |
| "learning_rate": 1.6e-07, | |
| "loss": 0.0993, | |
| "reward": 0.06304685212671757, | |
| "reward_std": 0.8850619196891785, | |
| "rewards/cosine_scaled_reward": -0.16639323788695037, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 3034.5416870117188, | |
| "epoch": 0.010285714285714285, | |
| "grad_norm": 0.17408320307731628, | |
| "kl": 3.820657730102539e-05, | |
| "learning_rate": 1.8e-07, | |
| "loss": 0.0851, | |
| "reward": 0.06854809075593948, | |
| "reward_std": 0.8176102936267853, | |
| "rewards/cosine_scaled_reward": -0.10114264115691185, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 2121.2500610351562, | |
| "epoch": 0.011428571428571429, | |
| "grad_norm": 0.07089601457118988, | |
| "kl": 2.7008354663848877e-05, | |
| "learning_rate": 2e-07, | |
| "loss": 0.026, | |
| "reward": 0.6565612219274044, | |
| "reward_std": 0.6731352433562279, | |
| "rewards/cosine_scaled_reward": 0.026197269558906555, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 2388.166748046875, | |
| "epoch": 0.012571428571428572, | |
| "grad_norm": 0.17368823289871216, | |
| "kl": 2.911686897277832e-05, | |
| "learning_rate": 2.1999999999999998e-07, | |
| "loss": 0.09, | |
| "reward": 0.7517527863383293, | |
| "reward_std": 1.0614946484565735, | |
| "rewards/cosine_scaled_reward": 0.07379304803907871, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 2672.5834350585938, | |
| "epoch": 0.013714285714285714, | |
| "grad_norm": 0.09804865717887878, | |
| "kl": 3.5643577575683594e-05, | |
| "learning_rate": 2.4e-07, | |
| "loss": 0.0423, | |
| "reward": 0.46549332328140736, | |
| "reward_std": 0.59340400993824, | |
| "rewards/cosine_scaled_reward": -0.006836682558059692, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 2250.187530517578, | |
| "epoch": 0.014857142857142857, | |
| "grad_norm": 0.10080444812774658, | |
| "kl": 3.0308961868286133e-05, | |
| "learning_rate": 2.6e-07, | |
| "loss": 0.0188, | |
| "reward": 0.6889139215054456, | |
| "reward_std": 0.8085261583328247, | |
| "rewards/cosine_scaled_reward": 0.06320697697810829, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 2936.9375610351562, | |
| "epoch": 0.016, | |
| "grad_norm": 0.1032668873667717, | |
| "kl": 4.1931867599487305e-05, | |
| "learning_rate": 2.8e-07, | |
| "loss": 0.0156, | |
| "reward": 0.10788557305932045, | |
| "reward_std": 0.6920560002326965, | |
| "rewards/cosine_scaled_reward": -0.11272389208897948, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 3221.666748046875, | |
| "epoch": 0.017142857142857144, | |
| "grad_norm": 0.10653272271156311, | |
| "kl": 3.7223100662231445e-05, | |
| "learning_rate": 3e-07, | |
| "loss": 0.0008, | |
| "reward": -0.2332199066877365, | |
| "reward_std": 0.63228340446949, | |
| "rewards/cosine_scaled_reward": -0.21035997135186335, | |
| "rewards/format_reward": 0.1875000111758709, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 2321.3750610351562, | |
| "epoch": 0.018285714285714287, | |
| "grad_norm": 0.14373674988746643, | |
| "kl": 2.193450927734375e-05, | |
| "learning_rate": 3.2e-07, | |
| "loss": 0.0532, | |
| "reward": 0.6621312350034714, | |
| "reward_std": 0.9647989273071289, | |
| "rewards/cosine_scaled_reward": 0.06023227237164974, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 3174.8333740234375, | |
| "epoch": 0.019428571428571427, | |
| "grad_norm": 0.07878188043832779, | |
| "kl": 3.62396240234375e-05, | |
| "learning_rate": 3.4000000000000003e-07, | |
| "loss": 0.0239, | |
| "reward": -0.20133600383996964, | |
| "reward_std": 0.5479727387428284, | |
| "rewards/cosine_scaled_reward": -0.2152513451874256, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 3214.229248046875, | |
| "epoch": 0.02057142857142857, | |
| "grad_norm": 0.1723223179578781, | |
| "kl": 5.7220458984375e-05, | |
| "learning_rate": 3.6e-07, | |
| "loss": 0.0648, | |
| "reward": -0.21091226488351822, | |
| "reward_std": 0.5157570615410805, | |
| "rewards/cosine_scaled_reward": -0.188789464533329, | |
| "rewards/format_reward": 0.1666666679084301, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 3238.9584350585938, | |
| "epoch": 0.021714285714285714, | |
| "grad_norm": 0.161203071475029, | |
| "kl": 2.1696090698242188e-05, | |
| "learning_rate": 3.7999999999999996e-07, | |
| "loss": 0.0562, | |
| "reward": 0.049652623711153865, | |
| "reward_std": 0.9271627813577652, | |
| "rewards/cosine_scaled_reward": -0.1210070364177227, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 2502.9584045410156, | |
| "epoch": 0.022857142857142857, | |
| "grad_norm": 0.19064471125602722, | |
| "kl": 3.2901763916015625e-05, | |
| "learning_rate": 4e-07, | |
| "loss": 0.097, | |
| "reward": 0.33966562896966934, | |
| "reward_std": 0.6814321130514145, | |
| "rewards/cosine_scaled_reward": -0.10100051760673523, | |
| "rewards/format_reward": 0.5416666865348816, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 2544.5833740234375, | |
| "epoch": 0.024, | |
| "grad_norm": 0.08170344680547714, | |
| "kl": 2.512335777282715e-05, | |
| "learning_rate": 4.1999999999999995e-07, | |
| "loss": 0.0099, | |
| "reward": 0.26008715480566025, | |
| "reward_std": 0.5456661060452461, | |
| "rewards/cosine_scaled_reward": -0.06787310540676117, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 3508.8126220703125, | |
| "epoch": 0.025142857142857144, | |
| "grad_norm": 0.14452184736728668, | |
| "kl": 2.7313828468322754e-05, | |
| "learning_rate": 4.3999999999999997e-07, | |
| "loss": 0.0283, | |
| "reward": -0.03812084347009659, | |
| "reward_std": 0.7810813337564468, | |
| "rewards/cosine_scaled_reward": -0.10239375196397305, | |
| "rewards/format_reward": 0.16666666977107525, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 3135.5000610351562, | |
| "epoch": 0.026285714285714287, | |
| "grad_norm": 0.18309734761714935, | |
| "kl": 4.690885543823242e-05, | |
| "learning_rate": 4.6e-07, | |
| "loss": 0.0864, | |
| "reward": -0.03534786030650139, | |
| "reward_std": 0.8103697001934052, | |
| "rewards/cosine_scaled_reward": -0.17392393667250872, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 2123.3750915527344, | |
| "epoch": 0.027428571428571427, | |
| "grad_norm": 0.07949961721897125, | |
| "kl": 1.4767050743103027e-05, | |
| "learning_rate": 4.8e-07, | |
| "loss": 0.0269, | |
| "reward": 0.6402075001969934, | |
| "reward_std": 0.7203418090939522, | |
| "rewards/cosine_scaled_reward": 0.018020419403910637, | |
| "rewards/format_reward": 0.6041666679084301, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 2792.7709045410156, | |
| "epoch": 0.02857142857142857, | |
| "grad_norm": 0.09897608309984207, | |
| "kl": 1.7628073692321777e-05, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0263, | |
| "reward": 0.3667532876133919, | |
| "reward_std": 0.5270465165376663, | |
| "rewards/cosine_scaled_reward": -0.03537335619330406, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 3103.5416870117188, | |
| "epoch": 0.029714285714285714, | |
| "grad_norm": 0.15197034180164337, | |
| "kl": 1.8015503883361816e-05, | |
| "learning_rate": 5.2e-07, | |
| "loss": 0.0517, | |
| "reward": 0.23722141981124878, | |
| "reward_std": 0.826317235827446, | |
| "rewards/cosine_scaled_reward": -0.027222641743719578, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 3099.729248046875, | |
| "epoch": 0.030857142857142857, | |
| "grad_norm": 0.11937292665243149, | |
| "kl": 2.5153160095214844e-05, | |
| "learning_rate": 5.4e-07, | |
| "loss": 0.0279, | |
| "reward": -0.05506348796188831, | |
| "reward_std": 0.483004167675972, | |
| "rewards/cosine_scaled_reward": -0.14211508259177208, | |
| "rewards/format_reward": 0.2291666716337204, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 3221.7291870117188, | |
| "epoch": 0.032, | |
| "grad_norm": 0.1231866255402565, | |
| "kl": 2.6211142539978027e-05, | |
| "learning_rate": 5.6e-07, | |
| "loss": -0.0031, | |
| "reward": 0.19264543801546097, | |
| "reward_std": 0.7934563755989075, | |
| "rewards/cosine_scaled_reward": -0.07034394145011902, | |
| "rewards/format_reward": 0.33333334885537624, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 3130.6459350585938, | |
| "epoch": 0.03314285714285714, | |
| "grad_norm": 0.14249049127101898, | |
| "kl": 2.726912498474121e-06, | |
| "learning_rate": 5.8e-07, | |
| "loss": 0.0394, | |
| "reward": 0.20082764513790607, | |
| "reward_std": 1.0230832546949387, | |
| "rewards/cosine_scaled_reward": -0.06625284859910607, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 3211.125, | |
| "epoch": 0.03428571428571429, | |
| "grad_norm": 0.11244227737188339, | |
| "kl": 2.047419548034668e-05, | |
| "learning_rate": 6e-07, | |
| "loss": 0.0135, | |
| "reward": 0.11087529244832695, | |
| "reward_std": 0.6219374239444733, | |
| "rewards/cosine_scaled_reward": -0.09039569273591042, | |
| "rewards/format_reward": 0.29166667722165585, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 2505.687530517578, | |
| "epoch": 0.03542857142857143, | |
| "grad_norm": 0.10730752348899841, | |
| "kl": 2.9802322387695312e-05, | |
| "learning_rate": 6.2e-07, | |
| "loss": 0.0711, | |
| "reward": 0.10028511472046375, | |
| "reward_std": 0.7022345140576363, | |
| "rewards/cosine_scaled_reward": -0.1686074547469616, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 3546.5, | |
| "epoch": 0.036571428571428574, | |
| "grad_norm": 0.08949411660432816, | |
| "kl": 2.053380012512207e-05, | |
| "learning_rate": 6.4e-07, | |
| "loss": 0.0081, | |
| "reward": -0.4492787718772888, | |
| "reward_std": 0.4731578528881073, | |
| "rewards/cosine_scaled_reward": -0.2454727292060852, | |
| "rewards/format_reward": 0.0416666679084301, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 3140.4584350585938, | |
| "epoch": 0.037714285714285714, | |
| "grad_norm": 0.15533116459846497, | |
| "kl": 1.6998499631881714e-05, | |
| "learning_rate": 6.6e-07, | |
| "loss": 0.0936, | |
| "reward": 0.14784683287143707, | |
| "reward_std": 0.8761000260710716, | |
| "rewards/cosine_scaled_reward": -0.10315992683172226, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 3067.5208740234375, | |
| "epoch": 0.038857142857142854, | |
| "grad_norm": 0.05691331624984741, | |
| "kl": 7.178634405136108e-06, | |
| "learning_rate": 6.800000000000001e-07, | |
| "loss": 0.0164, | |
| "reward": -0.4488837197422981, | |
| "reward_std": 0.4332050681114197, | |
| "rewards/cosine_scaled_reward": -0.31819187104701996, | |
| "rewards/format_reward": 0.1875, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 2977.979248046875, | |
| "epoch": 0.04, | |
| "grad_norm": 0.13275845348834991, | |
| "kl": 2.034008502960205e-05, | |
| "learning_rate": 7e-07, | |
| "loss": 0.0431, | |
| "reward": 0.19347557425498962, | |
| "reward_std": 0.7837567403912544, | |
| "rewards/cosine_scaled_reward": -0.11159555055201054, | |
| "rewards/format_reward": 0.4166666828095913, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 2511.7500610351562, | |
| "epoch": 0.04114285714285714, | |
| "grad_norm": 0.08902338147163391, | |
| "kl": 7.106363773345947e-05, | |
| "learning_rate": 7.2e-07, | |
| "loss": 0.0143, | |
| "reward": 0.6313629895448685, | |
| "reward_std": 0.4862937852740288, | |
| "rewards/cosine_scaled_reward": 0.06568148266524076, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 2623.6458435058594, | |
| "epoch": 0.04228571428571429, | |
| "grad_norm": 0.12060169875621796, | |
| "kl": 6.20037317276001e-05, | |
| "learning_rate": 7.4e-07, | |
| "loss": 0.0323, | |
| "reward": 0.4485716000199318, | |
| "reward_std": 0.8753202259540558, | |
| "rewards/cosine_scaled_reward": -0.0361308753490448, | |
| "rewards/format_reward": 0.5208333414047956, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 3015.5625610351562, | |
| "epoch": 0.04342857142857143, | |
| "grad_norm": 0.10110022872686386, | |
| "kl": 0.00016170740127563477, | |
| "learning_rate": 7.599999999999999e-07, | |
| "loss": 0.0344, | |
| "reward": -0.068646389991045, | |
| "reward_std": 0.6391054093837738, | |
| "rewards/cosine_scaled_reward": -0.22182317543774843, | |
| "rewards/format_reward": 0.37500002048909664, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 2867.5208740234375, | |
| "epoch": 0.044571428571428574, | |
| "grad_norm": 0.15215592086315155, | |
| "kl": 0.00011932849884033203, | |
| "learning_rate": 7.799999999999999e-07, | |
| "loss": 0.1002, | |
| "reward": 0.14817129005677998, | |
| "reward_std": 0.7805476784706116, | |
| "rewards/cosine_scaled_reward": -0.12383103743195534, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 3186.5000610351562, | |
| "epoch": 0.045714285714285714, | |
| "grad_norm": 0.11930648982524872, | |
| "kl": 0.00010547041893005371, | |
| "learning_rate": 8e-07, | |
| "loss": 0.0178, | |
| "reward": -0.03248624689877033, | |
| "reward_std": 0.63504558801651, | |
| "rewards/cosine_scaled_reward": -0.16207645926624537, | |
| "rewards/format_reward": 0.29166666977107525, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 3180.8959350585938, | |
| "epoch": 0.046857142857142854, | |
| "grad_norm": 0.18630492687225342, | |
| "kl": 3.663450479507446e-05, | |
| "learning_rate": 8.199999999999999e-07, | |
| "loss": 0.0553, | |
| "reward": 0.43380990624427795, | |
| "reward_std": 0.8565632924437523, | |
| "rewards/cosine_scaled_reward": 0.018988274037837982, | |
| "rewards/format_reward": 0.3958333432674408, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 2079.1041870117188, | |
| "epoch": 0.048, | |
| "grad_norm": 0.11225883662700653, | |
| "kl": 0.0004626065492630005, | |
| "learning_rate": 8.399999999999999e-07, | |
| "loss": 0.0429, | |
| "reward": 0.8895847648382187, | |
| "reward_std": 0.764504998922348, | |
| "rewards/cosine_scaled_reward": 0.11145903076976538, | |
| "rewards/format_reward": 0.6666666828095913, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 3000.166748046875, | |
| "epoch": 0.04914285714285714, | |
| "grad_norm": 0.24759933352470398, | |
| "kl": 0.00012095272541046143, | |
| "learning_rate": 8.599999999999999e-07, | |
| "loss": 0.064, | |
| "reward": 0.32910796254873276, | |
| "reward_std": 1.0378518775105476, | |
| "rewards/cosine_scaled_reward": -0.03336267964914441, | |
| "rewards/format_reward": 0.39583333395421505, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 2956.9375610351562, | |
| "epoch": 0.05028571428571429, | |
| "grad_norm": 0.28840357065200806, | |
| "kl": 0.0008223056793212891, | |
| "learning_rate": 8.799999999999999e-07, | |
| "loss": 0.0814, | |
| "reward": 0.2166026197373867, | |
| "reward_std": 0.745319314301014, | |
| "rewards/cosine_scaled_reward": -0.10003203712403774, | |
| "rewards/format_reward": 0.4166666828095913, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 2793.9583740234375, | |
| "epoch": 0.05142857142857143, | |
| "grad_norm": 0.1415959894657135, | |
| "kl": 6.61015510559082e-05, | |
| "learning_rate": 9e-07, | |
| "loss": 0.0791, | |
| "reward": 0.6622170452028513, | |
| "reward_std": 0.8223324418067932, | |
| "rewards/cosine_scaled_reward": 0.08110851421952248, | |
| "rewards/format_reward": 0.5000000074505806, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 2979.9583740234375, | |
| "epoch": 0.052571428571428575, | |
| "grad_norm": 0.10514923185110092, | |
| "kl": 0.00029647350311279297, | |
| "learning_rate": 9.2e-07, | |
| "loss": 0.0303, | |
| "reward": 0.2565183639526367, | |
| "reward_std": 0.5196356028318405, | |
| "rewards/cosine_scaled_reward": -0.03840749338269234, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 2650.1458587646484, | |
| "epoch": 0.053714285714285714, | |
| "grad_norm": 0.15202954411506653, | |
| "kl": 0.0002989917993545532, | |
| "learning_rate": 9.399999999999999e-07, | |
| "loss": 0.0978, | |
| "reward": 0.6011475473642349, | |
| "reward_std": 0.908449612557888, | |
| "rewards/cosine_scaled_reward": 0.04015708714723587, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 2887.9584350585938, | |
| "epoch": 0.054857142857142854, | |
| "grad_norm": 0.14365942776203156, | |
| "kl": 0.0003235340118408203, | |
| "learning_rate": 9.6e-07, | |
| "loss": 0.1098, | |
| "reward": 0.3464186545461416, | |
| "reward_std": 0.8909324407577515, | |
| "rewards/cosine_scaled_reward": -0.02470733504742384, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 2294.9792098999023, | |
| "epoch": 0.056, | |
| "grad_norm": 0.12595273554325104, | |
| "kl": 0.0003814399242401123, | |
| "learning_rate": 9.8e-07, | |
| "loss": 0.0294, | |
| "reward": 0.3887506239116192, | |
| "reward_std": 0.709479071199894, | |
| "rewards/cosine_scaled_reward": -0.08687468431890011, | |
| "rewards/format_reward": 0.5625000074505806, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 2483.5834350585938, | |
| "epoch": 0.05714285714285714, | |
| "grad_norm": 0.1347932517528534, | |
| "kl": 0.0020999908447265625, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0097, | |
| "reward": 0.48975098691880703, | |
| "reward_std": 0.7372790724039078, | |
| "rewards/cosine_scaled_reward": 0.015708832070231438, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 3298.0208740234375, | |
| "epoch": 0.05828571428571429, | |
| "grad_norm": 0.12307793647050858, | |
| "kl": 0.0010235309600830078, | |
| "learning_rate": 9.999890338174275e-07, | |
| "loss": 0.0291, | |
| "reward": -0.01613167393952608, | |
| "reward_std": 0.7748741805553436, | |
| "rewards/cosine_scaled_reward": -0.1538991741836071, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 3464.9375610351562, | |
| "epoch": 0.05942857142857143, | |
| "grad_norm": 0.14545413851737976, | |
| "kl": 0.0018963813781738281, | |
| "learning_rate": 9.999561358041868e-07, | |
| "loss": 0.0448, | |
| "reward": -0.23706040158867836, | |
| "reward_std": 0.7933510839939117, | |
| "rewards/cosine_scaled_reward": -0.18103019893169403, | |
| "rewards/format_reward": 0.12500000186264515, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 2938.2084350585938, | |
| "epoch": 0.060571428571428575, | |
| "grad_norm": 0.08684064447879791, | |
| "kl": 0.0016429424285888672, | |
| "learning_rate": 9.999013075636804e-07, | |
| "loss": 0.0463, | |
| "reward": -0.056304458528757095, | |
| "reward_std": 0.5842409431934357, | |
| "rewards/cosine_scaled_reward": -0.18440223019570112, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 2789.2916870117188, | |
| "epoch": 0.061714285714285715, | |
| "grad_norm": 0.060190364718437195, | |
| "kl": 0.0017528533935546875, | |
| "learning_rate": 9.998245517681593e-07, | |
| "loss": 0.0171, | |
| "reward": -0.11311334511265159, | |
| "reward_std": 0.42039141058921814, | |
| "rewards/cosine_scaled_reward": -0.23364001512527466, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 3271.5625, | |
| "epoch": 0.06285714285714286, | |
| "grad_norm": 0.07129888236522675, | |
| "kl": 0.0009405612945556641, | |
| "learning_rate": 9.997258721585931e-07, | |
| "loss": 0.0159, | |
| "reward": -0.34992948174476624, | |
| "reward_std": 0.4250538572669029, | |
| "rewards/cosine_scaled_reward": -0.24788140505552292, | |
| "rewards/format_reward": 0.14583333395421505, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 3073.604248046875, | |
| "epoch": 0.064, | |
| "grad_norm": 0.16036204993724823, | |
| "kl": 0.0025844573974609375, | |
| "learning_rate": 9.996052735444862e-07, | |
| "loss": 0.0509, | |
| "reward": 0.015420392155647278, | |
| "reward_std": 0.7796643078327179, | |
| "rewards/cosine_scaled_reward": -0.11728980112820864, | |
| "rewards/format_reward": 0.25000000558793545, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 3082.9584350585938, | |
| "epoch": 0.06514285714285714, | |
| "grad_norm": 0.14083248376846313, | |
| "kl": 0.010837554931640625, | |
| "learning_rate": 9.994627618036452e-07, | |
| "loss": 0.0506, | |
| "reward": 0.042304279981181026, | |
| "reward_std": 0.7727529257535934, | |
| "rewards/cosine_scaled_reward": -0.13509786408394575, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 3073.3541870117188, | |
| "epoch": 0.06628571428571428, | |
| "grad_norm": 0.16678181290626526, | |
| "kl": 0.003218412399291992, | |
| "learning_rate": 9.992983438818915e-07, | |
| "loss": 0.0068, | |
| "reward": 0.20121465623378754, | |
| "reward_std": 0.7175656408071518, | |
| "rewards/cosine_scaled_reward": -0.055642676539719105, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 3008.2709350585938, | |
| "epoch": 0.06742857142857143, | |
| "grad_norm": 0.1475798785686493, | |
| "kl": 0.009433746337890625, | |
| "learning_rate": 9.991120277927223e-07, | |
| "loss": 0.0125, | |
| "reward": 0.4367425888776779, | |
| "reward_std": 0.647830456495285, | |
| "rewards/cosine_scaled_reward": 0.06212127208709717, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 2855.6666870117188, | |
| "epoch": 0.06857142857142857, | |
| "grad_norm": 0.09679862856864929, | |
| "kl": 0.00621795654296875, | |
| "learning_rate": 9.989038226169207e-07, | |
| "loss": 0.0164, | |
| "reward": 0.47872328013181686, | |
| "reward_std": 0.5911416038870811, | |
| "rewards/cosine_scaled_reward": 0.0622783238068223, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 2144.3750610351562, | |
| "epoch": 0.06971428571428571, | |
| "grad_norm": 0.05888332054018974, | |
| "kl": 0.0020198822021484375, | |
| "learning_rate": 9.98673738502114e-07, | |
| "loss": 0.0166, | |
| "reward": 1.0081715881824493, | |
| "reward_std": 0.5063923448324203, | |
| "rewards/cosine_scaled_reward": 0.19158576428890228, | |
| "rewards/format_reward": 0.625, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 3236.3125610351562, | |
| "epoch": 0.07085714285714285, | |
| "grad_norm": 0.16112229228019714, | |
| "kl": 0.0008752346038818359, | |
| "learning_rate": 9.98421786662277e-07, | |
| "loss": 0.0491, | |
| "reward": 0.47718358784914017, | |
| "reward_std": 0.9821799397468567, | |
| "rewards/cosine_scaled_reward": 0.01984177529811859, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 2407.416748046875, | |
| "epoch": 0.072, | |
| "grad_norm": 0.09190040081739426, | |
| "kl": 0.009485244750976562, | |
| "learning_rate": 9.981479793771866e-07, | |
| "loss": 0.0014, | |
| "reward": 0.6355759827420115, | |
| "reward_std": 0.5608287900686264, | |
| "rewards/cosine_scaled_reward": 0.026121314615011215, | |
| "rewards/format_reward": 0.5833333358168602, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 3042.2708740234375, | |
| "epoch": 0.07314285714285715, | |
| "grad_norm": 0.11311787366867065, | |
| "kl": 0.0009531974792480469, | |
| "learning_rate": 9.97852329991824e-07, | |
| "loss": 0.0158, | |
| "reward": 0.624295711517334, | |
| "reward_std": 0.6829620823264122, | |
| "rewards/cosine_scaled_reward": 0.1142311654984951, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 2979.9376220703125, | |
| "epoch": 0.07428571428571429, | |
| "grad_norm": 0.17287743091583252, | |
| "kl": 0.008108139038085938, | |
| "learning_rate": 9.975348529157229e-07, | |
| "loss": 0.0803, | |
| "reward": 0.3465092070400715, | |
| "reward_std": 0.8748672604560852, | |
| "rewards/cosine_scaled_reward": -0.01424538716673851, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 3180.5833740234375, | |
| "epoch": 0.07542857142857143, | |
| "grad_norm": 0.13114774227142334, | |
| "kl": 0.0013761520385742188, | |
| "learning_rate": 9.971955636222684e-07, | |
| "loss": 0.011, | |
| "reward": 0.197968615218997, | |
| "reward_std": 0.808275930583477, | |
| "rewards/cosine_scaled_reward": -0.07809901610016823, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 3045.2709350585938, | |
| "epoch": 0.07657142857142857, | |
| "grad_norm": 0.16203086078166962, | |
| "kl": 0.0018739700317382812, | |
| "learning_rate": 9.968344786479415e-07, | |
| "loss": 0.0847, | |
| "reward": 0.6482307966798544, | |
| "reward_std": 1.029038056731224, | |
| "rewards/cosine_scaled_reward": 0.10536541882902384, | |
| "rewards/format_reward": 0.43750001676380634, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 2498.9166870117188, | |
| "epoch": 0.07771428571428571, | |
| "grad_norm": 0.06138293072581291, | |
| "kl": 0.00598907470703125, | |
| "learning_rate": 9.964516155915151e-07, | |
| "loss": -0.0034, | |
| "reward": 0.12972787162289023, | |
| "reward_std": 0.5004179775714874, | |
| "rewards/cosine_scaled_reward": -0.12263606488704681, | |
| "rewards/format_reward": 0.375, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 2794.0834350585938, | |
| "epoch": 0.07885714285714286, | |
| "grad_norm": 0.1431104838848114, | |
| "kl": 0.005124092102050781, | |
| "learning_rate": 9.960469931131936e-07, | |
| "loss": 0.0283, | |
| "reward": 0.5150027610361576, | |
| "reward_std": 0.6274815611541271, | |
| "rewards/cosine_scaled_reward": 0.01791803538799286, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 3173.8125610351562, | |
| "epoch": 0.08, | |
| "grad_norm": 0.146661639213562, | |
| "kl": 0.0033349990844726562, | |
| "learning_rate": 9.956206309337066e-07, | |
| "loss": 0.0731, | |
| "reward": 0.23033593781292439, | |
| "reward_std": 0.7032231390476227, | |
| "rewards/cosine_scaled_reward": -0.04108203295618296, | |
| "rewards/format_reward": 0.31250000186264515, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 3088.0834350585938, | |
| "epoch": 0.08114285714285714, | |
| "grad_norm": 0.1698896586894989, | |
| "kl": 0.005756378173828125, | |
| "learning_rate": 9.951725498333448e-07, | |
| "loss": 0.0857, | |
| "reward": 0.4810620807111263, | |
| "reward_std": 0.7472349628806114, | |
| "rewards/cosine_scaled_reward": 0.032197702676057816, | |
| "rewards/format_reward": 0.4166666828095913, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 2835.9583740234375, | |
| "epoch": 0.08228571428571428, | |
| "grad_norm": 0.15748044848442078, | |
| "kl": 0.005644321441650391, | |
| "learning_rate": 9.947027716509488e-07, | |
| "loss": 0.0414, | |
| "reward": 0.39926697919145226, | |
| "reward_std": 0.7735992036759853, | |
| "rewards/cosine_scaled_reward": -0.029533179476857185, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 2668.854248046875, | |
| "epoch": 0.08342857142857144, | |
| "grad_norm": 0.2273511439561844, | |
| "kl": 0.0141448974609375, | |
| "learning_rate": 9.942113192828444e-07, | |
| "loss": 0.1186, | |
| "reward": 0.6719660833477974, | |
| "reward_std": 0.9455910921096802, | |
| "rewards/cosine_scaled_reward": 0.0859830379486084, | |
| "rewards/format_reward": 0.5000000298023224, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 2737.291748046875, | |
| "epoch": 0.08457142857142858, | |
| "grad_norm": 0.16039791703224182, | |
| "kl": 0.007320404052734375, | |
| "learning_rate": 9.93698216681727e-07, | |
| "loss": 0.0567, | |
| "reward": 0.31655584648251534, | |
| "reward_std": 0.6061973124742508, | |
| "rewards/cosine_scaled_reward": -0.03963874280452728, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 2990.854248046875, | |
| "epoch": 0.08571428571428572, | |
| "grad_norm": 0.22528968751430511, | |
| "kl": 0.007213592529296875, | |
| "learning_rate": 9.931634888554935e-07, | |
| "loss": 0.1029, | |
| "reward": 0.07040337100625038, | |
| "reward_std": 0.8260042667388916, | |
| "rewards/cosine_scaled_reward": -0.10021498240530491, | |
| "rewards/format_reward": 0.27083333767950535, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 2957.6459350585938, | |
| "epoch": 0.08685714285714285, | |
| "grad_norm": 0.12294893711805344, | |
| "kl": 0.0023813247680664062, | |
| "learning_rate": 9.926071618660237e-07, | |
| "loss": 0.0543, | |
| "reward": 0.28933531790971756, | |
| "reward_std": 0.7524442374706268, | |
| "rewards/cosine_scaled_reward": -0.04283232241868973, | |
| "rewards/format_reward": 0.3750000074505806, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 3163.9583740234375, | |
| "epoch": 0.088, | |
| "grad_norm": 0.09998784214258194, | |
| "kl": 0.003734588623046875, | |
| "learning_rate": 9.9202926282791e-07, | |
| "loss": 0.0151, | |
| "reward": 0.43463192135095596, | |
| "reward_std": 0.6034069135785103, | |
| "rewards/cosine_scaled_reward": 0.0506493030115962, | |
| "rewards/format_reward": 0.3333333432674408, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 2789.729217529297, | |
| "epoch": 0.08914285714285715, | |
| "grad_norm": 0.1028476133942604, | |
| "kl": 0.0034427642822265625, | |
| "learning_rate": 9.91429819907136e-07, | |
| "loss": 0.009, | |
| "reward": 0.49053217470645905, | |
| "reward_std": 0.671901747584343, | |
| "rewards/cosine_scaled_reward": 0.005682730115950108, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 3021.9584350585938, | |
| "epoch": 0.09028571428571429, | |
| "grad_norm": 0.14524304866790771, | |
| "kl": 0.002349853515625, | |
| "learning_rate": 9.908088623197048e-07, | |
| "loss": 0.0437, | |
| "reward": 0.18685297295451164, | |
| "reward_std": 0.82758379727602, | |
| "rewards/cosine_scaled_reward": -0.10449018701910973, | |
| "rewards/format_reward": 0.3958333469927311, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 3422.2916870117188, | |
| "epoch": 0.09142857142857143, | |
| "grad_norm": 0.35899317264556885, | |
| "kl": 0.0026226043701171875, | |
| "learning_rate": 9.901664203302124e-07, | |
| "loss": 0.0522, | |
| "reward": -0.14088810980319977, | |
| "reward_std": 0.6001620069146156, | |
| "rewards/cosine_scaled_reward": -0.14336072688456625, | |
| "rewards/format_reward": 0.1458333358168602, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 3328.1458740234375, | |
| "epoch": 0.09257142857142857, | |
| "grad_norm": 0.1413203924894333, | |
| "kl": 0.003086090087890625, | |
| "learning_rate": 9.895025252503755e-07, | |
| "loss": -0.0009, | |
| "reward": 0.20990341156721115, | |
| "reward_std": 0.7368708997964859, | |
| "rewards/cosine_scaled_reward": -0.04088162397965789, | |
| "rewards/format_reward": 0.29166668094694614, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 3138.6041870117188, | |
| "epoch": 0.09371428571428571, | |
| "grad_norm": 0.5630224943161011, | |
| "kl": 0.0060558319091796875, | |
| "learning_rate": 9.888172094375033e-07, | |
| "loss": 0.0801, | |
| "reward": 0.005654335021972656, | |
| "reward_std": 0.7520733773708344, | |
| "rewards/cosine_scaled_reward": -0.14300616830587387, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 3398.0833740234375, | |
| "epoch": 0.09485714285714286, | |
| "grad_norm": 0.09970960766077042, | |
| "kl": 0.0034198760986328125, | |
| "learning_rate": 9.881105062929221e-07, | |
| "loss": 0.0172, | |
| "reward": -0.2690254710614681, | |
| "reward_std": 0.6017113700509071, | |
| "rewards/cosine_scaled_reward": -0.2490960769355297, | |
| "rewards/format_reward": 0.22916667722165585, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 2666.2501220703125, | |
| "epoch": 0.096, | |
| "grad_norm": 0.2184879034757614, | |
| "kl": 0.002471923828125, | |
| "learning_rate": 9.873824502603459e-07, | |
| "loss": 0.0858, | |
| "reward": 1.2698333784937859, | |
| "reward_std": 1.1699798554182053, | |
| "rewards/cosine_scaled_reward": 0.3119999971240759, | |
| "rewards/format_reward": 0.6458333656191826, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 3092.9791870117188, | |
| "epoch": 0.09714285714285714, | |
| "grad_norm": 0.11792045831680298, | |
| "kl": 0.0024585723876953125, | |
| "learning_rate": 9.866330768241983e-07, | |
| "loss": 0.0172, | |
| "reward": 0.3196272477507591, | |
| "reward_std": 0.7417704239487648, | |
| "rewards/cosine_scaled_reward": -0.017269723117351532, | |
| "rewards/format_reward": 0.354166679084301, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 3099.604248046875, | |
| "epoch": 0.09828571428571428, | |
| "grad_norm": 0.12413817644119263, | |
| "kl": 0.004852294921875, | |
| "learning_rate": 9.85862422507884e-07, | |
| "loss": 0.0517, | |
| "reward": -0.07946242019534111, | |
| "reward_std": 0.5531802475452423, | |
| "rewards/cosine_scaled_reward": -0.1751478873193264, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 3024.354278564453, | |
| "epoch": 0.09942857142857142, | |
| "grad_norm": 0.12308648228645325, | |
| "kl": 0.006999969482421875, | |
| "learning_rate": 9.850705248720068e-07, | |
| "loss": 0.0426, | |
| "reward": 0.1297205686569214, | |
| "reward_std": 0.7171878144145012, | |
| "rewards/cosine_scaled_reward": -0.12263973196968436, | |
| "rewards/format_reward": 0.37500000186264515, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 2875.6875610351562, | |
| "epoch": 0.10057142857142858, | |
| "grad_norm": 0.1610432118177414, | |
| "kl": 0.014064788818359375, | |
| "learning_rate": 9.8425742251254e-07, | |
| "loss": 0.0297, | |
| "reward": 0.6831055271031801, | |
| "reward_std": 0.7087237983942032, | |
| "rewards/cosine_scaled_reward": 0.0811360776424408, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 3291.3959350585938, | |
| "epoch": 0.10171428571428572, | |
| "grad_norm": 0.14732913672924042, | |
| "kl": 0.004520416259765625, | |
| "learning_rate": 9.83423155058946e-07, | |
| "loss": 0.063, | |
| "reward": 0.3873383179306984, | |
| "reward_std": 0.9104212373495102, | |
| "rewards/cosine_scaled_reward": 0.0374191589653492, | |
| "rewards/format_reward": 0.3125000111758709, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 3100.7500610351562, | |
| "epoch": 0.10285714285714286, | |
| "grad_norm": 0.09902340173721313, | |
| "kl": 0.005191802978515625, | |
| "learning_rate": 9.825677631722435e-07, | |
| "loss": 0.0212, | |
| "reward": 0.2355214934796095, | |
| "reward_std": 0.5521544776856899, | |
| "rewards/cosine_scaled_reward": -0.03848925232887268, | |
| "rewards/format_reward": 0.3125000074505806, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 3321.3958740234375, | |
| "epoch": 0.104, | |
| "grad_norm": 0.11201111227273941, | |
| "kl": 0.0046215057373046875, | |
| "learning_rate": 9.816912885430258e-07, | |
| "loss": 0.0302, | |
| "reward": 0.06314115412533283, | |
| "reward_std": 0.6101053357124329, | |
| "rewards/cosine_scaled_reward": -0.1246794331818819, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 2866.9375610351562, | |
| "epoch": 0.10514285714285715, | |
| "grad_norm": 0.08195216953754425, | |
| "kl": 0.00637054443359375, | |
| "learning_rate": 9.807937738894303e-07, | |
| "loss": 0.0374, | |
| "reward": 0.2856922000646591, | |
| "reward_std": 0.6180723085999489, | |
| "rewards/cosine_scaled_reward": -0.09673722740262747, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 2626.8333740234375, | |
| "epoch": 0.10628571428571429, | |
| "grad_norm": 0.0848076120018959, | |
| "kl": 0.00502777099609375, | |
| "learning_rate": 9.798752629550546e-07, | |
| "loss": 0.0158, | |
| "reward": 0.47025431878864765, | |
| "reward_std": 0.5611053630709648, | |
| "rewards/cosine_scaled_reward": 0.005960509181022644, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 3384.666748046875, | |
| "epoch": 0.10742857142857143, | |
| "grad_norm": 0.11509731411933899, | |
| "kl": 0.005451202392578125, | |
| "learning_rate": 9.78935800506826e-07, | |
| "loss": 0.0334, | |
| "reward": 0.010346372611820698, | |
| "reward_std": 0.6185438930988312, | |
| "rewards/cosine_scaled_reward": -0.09899348951876163, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 3308.729248046875, | |
| "epoch": 0.10857142857142857, | |
| "grad_norm": 0.13493004441261292, | |
| "kl": 0.00511932373046875, | |
| "learning_rate": 9.779754323328192e-07, | |
| "loss": 0.0494, | |
| "reward": -0.04175245389342308, | |
| "reward_std": 0.819076806306839, | |
| "rewards/cosine_scaled_reward": -0.14587622694671154, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 2638.8333740234375, | |
| "epoch": 0.10971428571428571, | |
| "grad_norm": 0.1093597412109375, | |
| "kl": 0.006412506103515625, | |
| "learning_rate": 9.769942052400235e-07, | |
| "loss": 0.0613, | |
| "reward": 0.2132774479687214, | |
| "reward_std": 0.6241517812013626, | |
| "rewards/cosine_scaled_reward": -0.1121112871915102, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 3025.687530517578, | |
| "epoch": 0.11085714285714286, | |
| "grad_norm": 0.14619475603103638, | |
| "kl": 0.017696380615234375, | |
| "learning_rate": 9.759921670520634e-07, | |
| "loss": 0.0518, | |
| "reward": 0.21731913276016712, | |
| "reward_std": 0.8663276582956314, | |
| "rewards/cosine_scaled_reward": -0.058007098734378815, | |
| "rewards/format_reward": 0.33333334513008595, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 2998.0833740234375, | |
| "epoch": 0.112, | |
| "grad_norm": 0.08425849676132202, | |
| "kl": 0.011322021484375, | |
| "learning_rate": 9.749693666068663e-07, | |
| "loss": 0.0316, | |
| "reward": -0.0604003369808197, | |
| "reward_std": 0.4831971898674965, | |
| "rewards/cosine_scaled_reward": -0.17603351920843124, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 2882.8958435058594, | |
| "epoch": 0.11314285714285714, | |
| "grad_norm": 0.10733813792467117, | |
| "kl": 0.0042572021484375, | |
| "learning_rate": 9.739258537542835e-07, | |
| "loss": 0.0469, | |
| "reward": 0.37905219942331314, | |
| "reward_std": 0.6325190886855125, | |
| "rewards/cosine_scaled_reward": 0.012442763894796371, | |
| "rewards/format_reward": 0.35416667722165585, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 3077.979248046875, | |
| "epoch": 0.11428571428571428, | |
| "grad_norm": 0.22007572650909424, | |
| "kl": 0.00611114501953125, | |
| "learning_rate": 9.728616793536587e-07, | |
| "loss": 0.0684, | |
| "reward": 0.15971739403903484, | |
| "reward_std": 0.8245379701256752, | |
| "rewards/cosine_scaled_reward": -0.0972246453166008, | |
| "rewards/format_reward": 0.3541666716337204, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 3149.5000610351562, | |
| "epoch": 0.11542857142857142, | |
| "grad_norm": 0.17998212575912476, | |
| "kl": 0.0086212158203125, | |
| "learning_rate": 9.717768952713511e-07, | |
| "loss": 0.0924, | |
| "reward": -0.043516192585229874, | |
| "reward_std": 0.7394061759114265, | |
| "rewards/cosine_scaled_reward": -0.1467580944299698, | |
| "rewards/format_reward": 0.25000001303851604, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 2830.2500610351562, | |
| "epoch": 0.11657142857142858, | |
| "grad_norm": 0.10636850446462631, | |
| "kl": 0.006778717041015625, | |
| "learning_rate": 9.706715543782064e-07, | |
| "loss": 0.0118, | |
| "reward": 0.20193170942366123, | |
| "reward_std": 0.5816469639539719, | |
| "rewards/cosine_scaled_reward": -0.09695081505924463, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 3253.354248046875, | |
| "epoch": 0.11771428571428572, | |
| "grad_norm": 0.10601601004600525, | |
| "kl": 0.0059051513671875, | |
| "learning_rate": 9.695457105469804e-07, | |
| "loss": 0.0393, | |
| "reward": 0.16332483664155006, | |
| "reward_std": 0.7165435254573822, | |
| "rewards/cosine_scaled_reward": -0.07458756864070892, | |
| "rewards/format_reward": 0.3125000149011612, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 2784.0416870117188, | |
| "epoch": 0.11885714285714286, | |
| "grad_norm": 0.14525532722473145, | |
| "kl": 0.00762176513671875, | |
| "learning_rate": 9.683994186497132e-07, | |
| "loss": 0.0257, | |
| "reward": 0.6941813006997108, | |
| "reward_std": 0.731097511947155, | |
| "rewards/cosine_scaled_reward": 0.13875730894505978, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 3037.291748046875, | |
| "epoch": 0.12, | |
| "grad_norm": 0.10406464338302612, | |
| "kl": 0.0091552734375, | |
| "learning_rate": 9.672327345550543e-07, | |
| "loss": 0.0217, | |
| "reward": -0.03945709019899368, | |
| "reward_std": 0.5527790486812592, | |
| "rewards/cosine_scaled_reward": -0.14472855255007744, | |
| "rewards/format_reward": 0.25000000186264515, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 3007.6250610351562, | |
| "epoch": 0.12114285714285715, | |
| "grad_norm": 0.1392635703086853, | |
| "kl": 0.00736236572265625, | |
| "learning_rate": 9.66045715125541e-07, | |
| "loss": 0.0638, | |
| "reward": 0.2589884400367737, | |
| "reward_std": 0.8927985578775406, | |
| "rewards/cosine_scaled_reward": -0.05800577998161316, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 2705.52099609375, | |
| "epoch": 0.12228571428571429, | |
| "grad_norm": 0.19877693057060242, | |
| "kl": 0.00640869140625, | |
| "learning_rate": 9.648384182148252e-07, | |
| "loss": 0.0852, | |
| "reward": 0.42868572287261486, | |
| "reward_std": 0.7907231077551842, | |
| "rewards/cosine_scaled_reward": -0.025240465998649597, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 2601.9793090820312, | |
| "epoch": 0.12342857142857143, | |
| "grad_norm": 0.1907849907875061, | |
| "kl": 0.010498046875, | |
| "learning_rate": 9.636109026648554e-07, | |
| "loss": 0.0862, | |
| "reward": 1.0781057141721249, | |
| "reward_std": 0.926390677690506, | |
| "rewards/cosine_scaled_reward": 0.2578028216958046, | |
| "rewards/format_reward": 0.5625, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 2873.500030517578, | |
| "epoch": 0.12457142857142857, | |
| "grad_norm": 0.12728413939476013, | |
| "kl": 0.00748443603515625, | |
| "learning_rate": 9.623632283030077e-07, | |
| "loss": 0.0443, | |
| "reward": 0.2420949712395668, | |
| "reward_std": 0.6641058176755905, | |
| "rewards/cosine_scaled_reward": -0.0872858352959156, | |
| "rewards/format_reward": 0.416666679084301, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 2759.041748046875, | |
| "epoch": 0.12571428571428572, | |
| "grad_norm": 0.3926822543144226, | |
| "kl": 0.0103759765625, | |
| "learning_rate": 9.610954559391704e-07, | |
| "loss": 0.051, | |
| "reward": 0.7485219649970531, | |
| "reward_std": 1.0151629000902176, | |
| "rewards/cosine_scaled_reward": 0.07217762316577137, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 1981.291748046875, | |
| "epoch": 0.12685714285714286, | |
| "grad_norm": 0.13100939989089966, | |
| "kl": 0.0102081298828125, | |
| "learning_rate": 9.598076473627796e-07, | |
| "loss": -0.018, | |
| "reward": 0.8073812872171402, | |
| "reward_std": 0.8186813145875931, | |
| "rewards/cosine_scaled_reward": 0.028690634877420962, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 3106.9583740234375, | |
| "epoch": 0.128, | |
| "grad_norm": 0.18594208359718323, | |
| "kl": 0.01175689697265625, | |
| "learning_rate": 9.58499865339809e-07, | |
| "loss": 0.0913, | |
| "reward": 0.3422376364469528, | |
| "reward_std": 0.8253115490078926, | |
| "rewards/cosine_scaled_reward": -0.016381196677684784, | |
| "rewards/format_reward": 0.3750000223517418, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 2985.0208740234375, | |
| "epoch": 0.12914285714285714, | |
| "grad_norm": 0.10086725652217865, | |
| "kl": 0.0164794921875, | |
| "learning_rate": 9.571721736097088e-07, | |
| "loss": 0.026, | |
| "reward": 0.6304376311600208, | |
| "reward_std": 0.6578450873494148, | |
| "rewards/cosine_scaled_reward": 0.10688545554876328, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 2085.0416870117188, | |
| "epoch": 0.13028571428571428, | |
| "grad_norm": 0.1902545839548111, | |
| "kl": 0.0139923095703125, | |
| "learning_rate": 9.55824636882301e-07, | |
| "loss": 0.0715, | |
| "reward": 0.9513098001480103, | |
| "reward_std": 0.9133107215166092, | |
| "rewards/cosine_scaled_reward": 0.13190488796681166, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 3096.6250610351562, | |
| "epoch": 0.13142857142857142, | |
| "grad_norm": 0.1532527357339859, | |
| "kl": 0.01084136962890625, | |
| "learning_rate": 9.54457320834625e-07, | |
| "loss": 0.0594, | |
| "reward": 0.5540619897656143, | |
| "reward_std": 0.9744190573692322, | |
| "rewards/cosine_scaled_reward": 0.05828099511563778, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 3017.5833740234375, | |
| "epoch": 0.13257142857142856, | |
| "grad_norm": 0.1256159394979477, | |
| "kl": 0.012542724609375, | |
| "learning_rate": 9.530702921077358e-07, | |
| "loss": 0.0515, | |
| "reward": -0.0034197866916656494, | |
| "reward_std": 0.6141533181071281, | |
| "rewards/cosine_scaled_reward": -0.12670988403260708, | |
| "rewards/format_reward": 0.25, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 3360.916748046875, | |
| "epoch": 0.1337142857142857, | |
| "grad_norm": 0.13030223548412323, | |
| "kl": 0.010650634765625, | |
| "learning_rate": 9.516636183034564e-07, | |
| "loss": 0.0416, | |
| "reward": -0.01844558771699667, | |
| "reward_std": 0.7771024033427238, | |
| "rewards/cosine_scaled_reward": -0.13422280363738537, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 2493.2709350585938, | |
| "epoch": 0.13485714285714287, | |
| "grad_norm": 0.10445129871368408, | |
| "kl": 0.01708984375, | |
| "learning_rate": 9.502373679810839e-07, | |
| "loss": 0.045, | |
| "reward": 0.7345311008393764, | |
| "reward_std": 0.6608476266264915, | |
| "rewards/cosine_scaled_reward": 0.054765526205301285, | |
| "rewards/format_reward": 0.6250000055879354, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 2583.8334045410156, | |
| "epoch": 0.136, | |
| "grad_norm": 0.1751917004585266, | |
| "kl": 0.01385498046875, | |
| "learning_rate": 9.487916106540465e-07, | |
| "loss": 0.1151, | |
| "reward": 0.16756585985422134, | |
| "reward_std": 0.6609668508172035, | |
| "rewards/cosine_scaled_reward": -0.1558004072867334, | |
| "rewards/format_reward": 0.4791666939854622, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 3435.6458740234375, | |
| "epoch": 0.13714285714285715, | |
| "grad_norm": 0.20698896050453186, | |
| "kl": 0.01154327392578125, | |
| "learning_rate": 9.473264167865171e-07, | |
| "loss": 0.0604, | |
| "reward": -0.05263599753379822, | |
| "reward_std": 1.0508478283882141, | |
| "rewards/cosine_scaled_reward": -0.17215134110301733, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 3101.875, | |
| "epoch": 0.1382857142857143, | |
| "grad_norm": 0.10516638308763504, | |
| "kl": 0.012359619140625, | |
| "learning_rate": 9.458418577899774e-07, | |
| "loss": 0.0121, | |
| "reward": 0.045499179512262344, | |
| "reward_std": 0.5043403655290604, | |
| "rewards/cosine_scaled_reward": -0.10225043445825577, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 3044.3541870117188, | |
| "epoch": 0.13942857142857143, | |
| "grad_norm": 0.10074342042207718, | |
| "kl": 0.019744873046875, | |
| "learning_rate": 9.443380060197385e-07, | |
| "loss": 0.0442, | |
| "reward": 0.021381250582635403, | |
| "reward_std": 0.5577950775623322, | |
| "rewards/cosine_scaled_reward": -0.13514270819723606, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 2913.7083740234375, | |
| "epoch": 0.14057142857142857, | |
| "grad_norm": 0.14308768510818481, | |
| "kl": 0.0152587890625, | |
| "learning_rate": 9.428149347714143e-07, | |
| "loss": 0.0878, | |
| "reward": 0.12965750694274902, | |
| "reward_std": 0.736047625541687, | |
| "rewards/cosine_scaled_reward": -0.09142125025391579, | |
| "rewards/format_reward": 0.31250000558793545, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 2562.5000610351562, | |
| "epoch": 0.1417142857142857, | |
| "grad_norm": 0.19142040610313416, | |
| "kl": 0.01031494140625, | |
| "learning_rate": 9.412727182773486e-07, | |
| "loss": 0.065, | |
| "reward": 0.8353077471256256, | |
| "reward_std": 1.026055485010147, | |
| "rewards/cosine_scaled_reward": 0.12598720658570528, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 3017.6251220703125, | |
| "epoch": 0.14285714285714285, | |
| "grad_norm": 0.117274209856987, | |
| "kl": 0.009552001953125, | |
| "learning_rate": 9.397114317029974e-07, | |
| "loss": 0.0076, | |
| "reward": 0.1632972015067935, | |
| "reward_std": 0.5557524636387825, | |
| "rewards/cosine_scaled_reward": -0.10585140064358711, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 2858.8334350585938, | |
| "epoch": 0.144, | |
| "grad_norm": 0.2655041217803955, | |
| "kl": 0.01821136474609375, | |
| "learning_rate": 9.381311511432658e-07, | |
| "loss": 0.0873, | |
| "reward": 0.30082017183303833, | |
| "reward_std": 0.9569597989320755, | |
| "rewards/cosine_scaled_reward": -0.06833992386236787, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 2871.2083435058594, | |
| "epoch": 0.14514285714285713, | |
| "grad_norm": 0.0872960090637207, | |
| "kl": 0.0139007568359375, | |
| "learning_rate": 9.36531953618799e-07, | |
| "loss": -0.0046, | |
| "reward": 0.1562192291021347, | |
| "reward_std": 0.58997593075037, | |
| "rewards/cosine_scaled_reward": -0.10939039289951324, | |
| "rewards/format_reward": 0.37500000558793545, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 3199.2291870117188, | |
| "epoch": 0.1462857142857143, | |
| "grad_norm": 0.21217796206474304, | |
| "kl": 0.020172119140625, | |
| "learning_rate": 9.34913917072228e-07, | |
| "loss": 0.0995, | |
| "reward": 0.07891843095421791, | |
| "reward_std": 0.858635775744915, | |
| "rewards/cosine_scaled_reward": -0.10637411894276738, | |
| "rewards/format_reward": 0.2916666716337204, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 2658.1458435058594, | |
| "epoch": 0.14742857142857144, | |
| "grad_norm": 0.13081493973731995, | |
| "kl": 0.0191650390625, | |
| "learning_rate": 9.332771203643714e-07, | |
| "loss": 0.0418, | |
| "reward": 0.2748406231403351, | |
| "reward_std": 0.6719504073262215, | |
| "rewards/cosine_scaled_reward": -0.10216302564367652, | |
| "rewards/format_reward": 0.4791666753590107, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 3460.2291870117188, | |
| "epoch": 0.14857142857142858, | |
| "grad_norm": 0.12681667506694794, | |
| "kl": 0.01409912109375, | |
| "learning_rate": 9.316216432703916e-07, | |
| "loss": 0.0399, | |
| "reward": -0.11496437340974808, | |
| "reward_std": 0.6864899545907974, | |
| "rewards/cosine_scaled_reward": -0.15123217983637005, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 3362.3750610351562, | |
| "epoch": 0.14971428571428572, | |
| "grad_norm": 0.12439722567796707, | |
| "kl": 0.01568603515625, | |
| "learning_rate": 9.299475664759068e-07, | |
| "loss": 0.0316, | |
| "reward": 0.17270515114068985, | |
| "reward_std": 0.636282742023468, | |
| "rewards/cosine_scaled_reward": -0.01781410351395607, | |
| "rewards/format_reward": 0.20833334513008595, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 3433.3333740234375, | |
| "epoch": 0.15085714285714286, | |
| "grad_norm": 0.13320712745189667, | |
| "kl": 0.020172119140625, | |
| "learning_rate": 9.282549715730579e-07, | |
| "loss": 0.0099, | |
| "reward": -0.2513204962015152, | |
| "reward_std": 0.6501054912805557, | |
| "rewards/cosine_scaled_reward": -0.2298269160091877, | |
| "rewards/format_reward": 0.2083333432674408, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 3220.1459350585938, | |
| "epoch": 0.152, | |
| "grad_norm": 0.17302778363227844, | |
| "kl": 0.01995849609375, | |
| "learning_rate": 9.265439410565328e-07, | |
| "loss": 0.0398, | |
| "reward": -0.11768799647688866, | |
| "reward_std": 0.6951716169714928, | |
| "rewards/cosine_scaled_reward": -0.22551067918539047, | |
| "rewards/format_reward": 0.3333333544433117, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 2314.5416870117188, | |
| "epoch": 0.15314285714285714, | |
| "grad_norm": 0.0858488380908966, | |
| "kl": 0.025665283203125, | |
| "learning_rate": 9.248145583195447e-07, | |
| "loss": 0.0092, | |
| "reward": 0.602238692343235, | |
| "reward_std": 0.563841238617897, | |
| "rewards/cosine_scaled_reward": -0.011380670592188835, | |
| "rewards/format_reward": 0.625, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 2965.8750610351562, | |
| "epoch": 0.15428571428571428, | |
| "grad_norm": 0.17062057554721832, | |
| "kl": 0.019134521484375, | |
| "learning_rate": 9.230669076497687e-07, | |
| "loss": 0.045, | |
| "reward": 0.17675711959600449, | |
| "reward_std": 0.5801602862775326, | |
| "rewards/cosine_scaled_reward": -0.05745477043092251, | |
| "rewards/format_reward": 0.2916666679084301, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 2485.8334045410156, | |
| "epoch": 0.15542857142857142, | |
| "grad_norm": 0.13649305701255798, | |
| "kl": 0.021697998046875, | |
| "learning_rate": 9.213010742252327e-07, | |
| "loss": 0.0181, | |
| "reward": 0.6858363393694162, | |
| "reward_std": 0.8353622853755951, | |
| "rewards/cosine_scaled_reward": 0.009584830142557621, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 3372.6043090820312, | |
| "epoch": 0.15657142857142858, | |
| "grad_norm": 0.12744168937206268, | |
| "kl": 0.0316314697265625, | |
| "learning_rate": 9.195171441101668e-07, | |
| "loss": 0.0593, | |
| "reward": -0.08887681737542152, | |
| "reward_std": 0.6366704031825066, | |
| "rewards/cosine_scaled_reward": -0.1486050896346569, | |
| "rewards/format_reward": 0.2083333358168602, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 2583.1250915527344, | |
| "epoch": 0.15771428571428572, | |
| "grad_norm": 0.1180926188826561, | |
| "kl": 0.0181732177734375, | |
| "learning_rate": 9.177152042508077e-07, | |
| "loss": 0.035, | |
| "reward": 0.8022582903504372, | |
| "reward_std": 0.7210212647914886, | |
| "rewards/cosine_scaled_reward": 0.11987911909818649, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 2719.5208435058594, | |
| "epoch": 0.15885714285714286, | |
| "grad_norm": 0.13920994102954865, | |
| "kl": 0.0205841064453125, | |
| "learning_rate": 9.158953424711624e-07, | |
| "loss": 0.0124, | |
| "reward": 0.43635744601488113, | |
| "reward_std": 0.7499766424298286, | |
| "rewards/cosine_scaled_reward": -0.042237947694957256, | |
| "rewards/format_reward": 0.520833345130086, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 2655.6251220703125, | |
| "epoch": 0.16, | |
| "grad_norm": 0.12660294771194458, | |
| "kl": 0.01995849609375, | |
| "learning_rate": 9.140576474687263e-07, | |
| "loss": 0.0166, | |
| "reward": 0.6822620648890734, | |
| "reward_std": 0.6412546709179878, | |
| "rewards/cosine_scaled_reward": 0.049464356154203415, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 2931.5208740234375, | |
| "epoch": 0.16114285714285714, | |
| "grad_norm": 0.21838468313217163, | |
| "kl": 0.023284912109375, | |
| "learning_rate": 9.122022088101613e-07, | |
| "loss": 0.0897, | |
| "reward": 0.5501389801502228, | |
| "reward_std": 0.931708961725235, | |
| "rewards/cosine_scaled_reward": 0.05631948262453079, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 2731.5209350585938, | |
| "epoch": 0.16228571428571428, | |
| "grad_norm": 0.1206783875823021, | |
| "kl": 0.0212860107421875, | |
| "learning_rate": 9.103291169269299e-07, | |
| "loss": 0.0285, | |
| "reward": 0.44770222902297974, | |
| "reward_std": 0.6320216841995716, | |
| "rewards/cosine_scaled_reward": 0.01551777683198452, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 3003.0833740234375, | |
| "epoch": 0.16342857142857142, | |
| "grad_norm": 0.1385820508003235, | |
| "kl": 0.0207366943359375, | |
| "learning_rate": 9.084384631108882e-07, | |
| "loss": 0.0375, | |
| "reward": 0.36010952293872833, | |
| "reward_std": 0.6810671910643578, | |
| "rewards/cosine_scaled_reward": -0.007445234805345535, | |
| "rewards/format_reward": 0.37500001676380634, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 3069.2084350585938, | |
| "epoch": 0.16457142857142856, | |
| "grad_norm": 0.24886491894721985, | |
| "kl": 0.023193359375, | |
| "learning_rate": 9.065303395098358e-07, | |
| "loss": 0.0784, | |
| "reward": 0.5813055820763111, | |
| "reward_std": 1.03695610165596, | |
| "rewards/cosine_scaled_reward": 0.030236128717660904, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 3248.541748046875, | |
| "epoch": 0.1657142857142857, | |
| "grad_norm": 0.27944961190223694, | |
| "kl": 0.026947021484375, | |
| "learning_rate": 9.046048391230247e-07, | |
| "loss": 0.0821, | |
| "reward": 0.31095648277550936, | |
| "reward_std": 1.044460952281952, | |
| "rewards/cosine_scaled_reward": -0.0007717590779066086, | |
| "rewards/format_reward": 0.3125000037252903, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 3391.2918090820312, | |
| "epoch": 0.16685714285714287, | |
| "grad_norm": 0.1663837432861328, | |
| "kl": 0.033966064453125, | |
| "learning_rate": 9.026620557966279e-07, | |
| "loss": 0.0146, | |
| "reward": 0.22623740322887897, | |
| "reward_std": 0.7760383784770966, | |
| "rewards/cosine_scaled_reward": -0.10563132539391518, | |
| "rewards/format_reward": 0.4375000111758709, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 2764.3750610351562, | |
| "epoch": 0.168, | |
| "grad_norm": 0.15888190269470215, | |
| "kl": 0.0340576171875, | |
| "learning_rate": 9.007020842191634e-07, | |
| "loss": 0.02, | |
| "reward": 0.43453994020819664, | |
| "reward_std": 0.6980537474155426, | |
| "rewards/cosine_scaled_reward": -0.011896707117557526, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 3387.8125610351562, | |
| "epoch": 0.16914285714285715, | |
| "grad_norm": 0.1351691633462906, | |
| "kl": 0.0419921875, | |
| "learning_rate": 8.987250199168808e-07, | |
| "loss": 0.0369, | |
| "reward": -0.3159765365999192, | |
| "reward_std": 0.5913000628352165, | |
| "rewards/cosine_scaled_reward": -0.2517382688820362, | |
| "rewards/format_reward": 0.18750000558793545, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 3070.1458740234375, | |
| "epoch": 0.1702857142857143, | |
| "grad_norm": 0.13587744534015656, | |
| "kl": 0.0269927978515625, | |
| "learning_rate": 8.967309592491052e-07, | |
| "loss": 0.0154, | |
| "reward": 0.8058477342128754, | |
| "reward_std": 0.6222796887159348, | |
| "rewards/cosine_scaled_reward": 0.17375719547271729, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 2798.2084350585938, | |
| "epoch": 0.17142857142857143, | |
| "grad_norm": 0.15157835185527802, | |
| "kl": 0.03900146484375, | |
| "learning_rate": 8.9471999940354e-07, | |
| "loss": 0.038, | |
| "reward": 0.05908125883433968, | |
| "reward_std": 0.7354179471731186, | |
| "rewards/cosine_scaled_reward": -0.21004271879792213, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 2979.8541870117188, | |
| "epoch": 0.17257142857142857, | |
| "grad_norm": 0.08188746124505997, | |
| "kl": 0.028045654296875, | |
| "learning_rate": 8.926922383915315e-07, | |
| "loss": -0.004, | |
| "reward": -0.011732706800103188, | |
| "reward_std": 0.44251058250665665, | |
| "rewards/cosine_scaled_reward": -0.1621163571253419, | |
| "rewards/format_reward": 0.3125, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 3252.1458740234375, | |
| "epoch": 0.1737142857142857, | |
| "grad_norm": 0.17107248306274414, | |
| "kl": 0.039520263671875, | |
| "learning_rate": 8.906477750432903e-07, | |
| "loss": 0.0199, | |
| "reward": 0.2528679259121418, | |
| "reward_std": 0.782855249941349, | |
| "rewards/cosine_scaled_reward": 0.0014339573681354523, | |
| "rewards/format_reward": 0.2500000111758709, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 2536.1458740234375, | |
| "epoch": 0.17485714285714285, | |
| "grad_norm": 0.13167858123779297, | |
| "kl": 0.02642822265625, | |
| "learning_rate": 8.88586709003076e-07, | |
| "loss": 0.0437, | |
| "reward": 0.8370774015784264, | |
| "reward_std": 0.7839193791151047, | |
| "rewards/cosine_scaled_reward": 0.08520536310970783, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 2881.3125610351562, | |
| "epoch": 0.176, | |
| "grad_norm": 0.20085100829601288, | |
| "kl": 0.0284423828125, | |
| "learning_rate": 8.865091407243394e-07, | |
| "loss": 0.0533, | |
| "reward": 0.6227563321590424, | |
| "reward_std": 0.8027107864618301, | |
| "rewards/cosine_scaled_reward": 0.0717947967350483, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 3419.541748046875, | |
| "epoch": 0.17714285714285713, | |
| "grad_norm": 0.1839601695537567, | |
| "kl": 0.03076171875, | |
| "learning_rate": 8.844151714648274e-07, | |
| "loss": -0.0021, | |
| "reward": 0.12460730504244566, | |
| "reward_std": 0.942700669169426, | |
| "rewards/cosine_scaled_reward": -0.07311302423477173, | |
| "rewards/format_reward": 0.2708333432674408, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 3294.9791870117188, | |
| "epoch": 0.1782857142857143, | |
| "grad_norm": 0.15247705578804016, | |
| "kl": 0.03961181640625, | |
| "learning_rate": 8.823049032816478e-07, | |
| "loss": 0.0569, | |
| "reward": -0.32880749367177486, | |
| "reward_std": 0.5324635952711105, | |
| "rewards/cosine_scaled_reward": -0.2685704119503498, | |
| "rewards/format_reward": 0.20833334140479565, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 2784.7916870117188, | |
| "epoch": 0.17942857142857144, | |
| "grad_norm": 0.29496413469314575, | |
| "kl": 0.0323944091796875, | |
| "learning_rate": 8.801784390262943e-07, | |
| "loss": 0.0628, | |
| "reward": 0.37049394473433495, | |
| "reward_std": 1.1466023474931717, | |
| "rewards/cosine_scaled_reward": -0.07516971230506897, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 3182.6875610351562, | |
| "epoch": 0.18057142857142858, | |
| "grad_norm": 0.1029396653175354, | |
| "kl": 0.04150390625, | |
| "learning_rate": 8.780358823396352e-07, | |
| "loss": 0.024, | |
| "reward": -0.27919139340519905, | |
| "reward_std": 0.5330808311700821, | |
| "rewards/cosine_scaled_reward": -0.2541790306568146, | |
| "rewards/format_reward": 0.22916666977107525, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 2943.6250610351562, | |
| "epoch": 0.18171428571428572, | |
| "grad_norm": 0.4807628393173218, | |
| "kl": 0.05224609375, | |
| "learning_rate": 8.758773376468604e-07, | |
| "loss": 0.136, | |
| "reward": 0.4126173257827759, | |
| "reward_std": 0.9520216137170792, | |
| "rewards/cosine_scaled_reward": -0.012441340368241072, | |
| "rewards/format_reward": 0.4375000149011612, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 2988.7291870117188, | |
| "epoch": 0.18285714285714286, | |
| "grad_norm": 0.1530563086271286, | |
| "kl": 0.041748046875, | |
| "learning_rate": 8.737029101523929e-07, | |
| "loss": 0.0217, | |
| "reward": 0.5735020600259304, | |
| "reward_std": 0.8126933425664902, | |
| "rewards/cosine_scaled_reward": 0.03675099462270737, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 3521.916748046875, | |
| "epoch": 0.184, | |
| "grad_norm": 0.18781894445419312, | |
| "kl": 0.04571533203125, | |
| "learning_rate": 8.715127058347614e-07, | |
| "loss": 0.0335, | |
| "reward": -0.04885682836174965, | |
| "reward_std": 0.8325313180685043, | |
| "rewards/cosine_scaled_reward": -0.12859507277607918, | |
| "rewards/format_reward": 0.2083333395421505, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 3144.6875, | |
| "epoch": 0.18514285714285714, | |
| "grad_norm": 0.15695880353450775, | |
| "kl": 0.0509033203125, | |
| "learning_rate": 8.693068314414344e-07, | |
| "loss": 0.033, | |
| "reward": 0.5102378875017166, | |
| "reward_std": 0.7466369420289993, | |
| "rewards/cosine_scaled_reward": -0.02613106439821422, | |
| "rewards/format_reward": 0.5625000111758709, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 2595.2083435058594, | |
| "epoch": 0.18628571428571428, | |
| "grad_norm": 0.25721773505210876, | |
| "kl": 0.056854248046875, | |
| "learning_rate": 8.670853944836176e-07, | |
| "loss": -0.002, | |
| "reward": 0.3704167567193508, | |
| "reward_std": 0.6248408891260624, | |
| "rewards/cosine_scaled_reward": -0.06479163467884064, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 3132.6458740234375, | |
| "epoch": 0.18742857142857142, | |
| "grad_norm": 0.3121108412742615, | |
| "kl": 0.066162109375, | |
| "learning_rate": 8.648485032310144e-07, | |
| "loss": 0.0436, | |
| "reward": -0.15848805382847786, | |
| "reward_std": 0.6006623804569244, | |
| "rewards/cosine_scaled_reward": -0.21466069296002388, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 3106.7918090820312, | |
| "epoch": 0.18857142857142858, | |
| "grad_norm": 0.36176592111587524, | |
| "kl": 0.059814453125, | |
| "learning_rate": 8.625962667065487e-07, | |
| "loss": 0.0527, | |
| "reward": 0.523316752165556, | |
| "reward_std": 1.1928484439849854, | |
| "rewards/cosine_scaled_reward": 0.022075051441788673, | |
| "rewards/format_reward": 0.4791666865348816, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 2909.6458435058594, | |
| "epoch": 0.18971428571428572, | |
| "grad_norm": 0.09995611011981964, | |
| "kl": 0.0648193359375, | |
| "learning_rate": 8.603287946810513e-07, | |
| "loss": 0.0269, | |
| "reward": 0.30894866585731506, | |
| "reward_std": 0.543118342757225, | |
| "rewards/cosine_scaled_reward": -0.022608992643654346, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 3123.3750610351562, | |
| "epoch": 0.19085714285714286, | |
| "grad_norm": 0.19064339995384216, | |
| "kl": 0.065673828125, | |
| "learning_rate": 8.580461976679099e-07, | |
| "loss": 0.0316, | |
| "reward": -0.04249940067529678, | |
| "reward_std": 0.6464731246232986, | |
| "rewards/cosine_scaled_reward": -0.2087497040629387, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 2451.8958740234375, | |
| "epoch": 0.192, | |
| "grad_norm": 0.2923497259616852, | |
| "kl": 0.06597900390625, | |
| "learning_rate": 8.557485869176825e-07, | |
| "loss": 0.0505, | |
| "reward": 0.5589314834214747, | |
| "reward_std": 0.7117247879505157, | |
| "rewards/cosine_scaled_reward": -0.03303426876664162, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 2313.9375610351562, | |
| "epoch": 0.19314285714285714, | |
| "grad_norm": 0.22615736722946167, | |
| "kl": 0.0550537109375, | |
| "learning_rate": 8.534360744126753e-07, | |
| "loss": 0.0106, | |
| "reward": 0.9692112673074007, | |
| "reward_std": 0.9811852872371674, | |
| "rewards/cosine_scaled_reward": 0.1616889564320445, | |
| "rewards/format_reward": 0.6458333507180214, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 2736.4375610351562, | |
| "epoch": 0.19428571428571428, | |
| "grad_norm": 0.17036058008670807, | |
| "kl": 0.08740234375, | |
| "learning_rate": 8.511087728614862e-07, | |
| "loss": 0.0282, | |
| "reward": 0.17224126309156418, | |
| "reward_std": 0.5562086030840874, | |
| "rewards/cosine_scaled_reward": -0.09096270857844502, | |
| "rewards/format_reward": 0.35416666977107525, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 2596.1458740234375, | |
| "epoch": 0.19542857142857142, | |
| "grad_norm": 0.2933753728866577, | |
| "kl": 0.1033935546875, | |
| "learning_rate": 8.487667956935087e-07, | |
| "loss": -0.0277, | |
| "reward": 0.3066958854906261, | |
| "reward_std": 1.1108788549900055, | |
| "rewards/cosine_scaled_reward": 0.0283479536883533, | |
| "rewards/format_reward": 0.2500000074505806, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 2705.1251220703125, | |
| "epoch": 0.19657142857142856, | |
| "grad_norm": 0.18700729310512543, | |
| "kl": 0.1024169921875, | |
| "learning_rate": 8.464102570534061e-07, | |
| "loss": 0.0173, | |
| "reward": 0.4773051217198372, | |
| "reward_std": 0.8035851642489433, | |
| "rewards/cosine_scaled_reward": -0.032180776819586754, | |
| "rewards/format_reward": 0.5416666828095913, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 2499.479248046875, | |
| "epoch": 0.1977142857142857, | |
| "grad_norm": 0.5721752643585205, | |
| "kl": 0.1807861328125, | |
| "learning_rate": 8.440392717955475e-07, | |
| "loss": 0.0732, | |
| "reward": 0.582635186612606, | |
| "reward_std": 0.9862835854291916, | |
| "rewards/cosine_scaled_reward": -0.010765749961137772, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 3085.5000610351562, | |
| "epoch": 0.19885714285714284, | |
| "grad_norm": 0.20046721398830414, | |
| "kl": 0.103759765625, | |
| "learning_rate": 8.416539554784089e-07, | |
| "loss": 0.0112, | |
| "reward": 0.35188272781670094, | |
| "reward_std": 0.5055751278996468, | |
| "rewards/cosine_scaled_reward": -0.032391976565122604, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 2600.9584350585938, | |
| "epoch": 0.2, | |
| "grad_norm": 0.27946504950523376, | |
| "kl": 0.1097412109375, | |
| "learning_rate": 8.392544243589427e-07, | |
| "loss": 0.0436, | |
| "reward": 0.622465105727315, | |
| "reward_std": 0.4762147720903158, | |
| "rewards/cosine_scaled_reward": 0.0716492049396038, | |
| "rewards/format_reward": 0.4791666679084301, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 2800.7084350585938, | |
| "epoch": 0.20114285714285715, | |
| "grad_norm": 0.27955377101898193, | |
| "kl": 0.1202392578125, | |
| "learning_rate": 8.368407953869103e-07, | |
| "loss": 0.0389, | |
| "reward": 0.4877171404659748, | |
| "reward_std": 0.9056157171726227, | |
| "rewards/cosine_scaled_reward": -0.006141431163996458, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 2680.6459350585938, | |
| "epoch": 0.2022857142857143, | |
| "grad_norm": 0.2823414206504822, | |
| "kl": 0.1099853515625, | |
| "learning_rate": 8.344131861991828e-07, | |
| "loss": 0.0552, | |
| "reward": 0.033542659133672714, | |
| "reward_std": 0.5713647753000259, | |
| "rewards/cosine_scaled_reward": -0.1603120118379593, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 2046.5625305175781, | |
| "epoch": 0.20342857142857143, | |
| "grad_norm": 0.20538190007209778, | |
| "kl": 0.1141357421875, | |
| "learning_rate": 8.319717151140072e-07, | |
| "loss": 0.0464, | |
| "reward": 0.728565389290452, | |
| "reward_std": 0.6446417346596718, | |
| "rewards/cosine_scaled_reward": 0.07261601462960243, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 2761.5000610351562, | |
| "epoch": 0.20457142857142857, | |
| "grad_norm": 0.40644508600234985, | |
| "kl": 0.1458740234375, | |
| "learning_rate": 8.295165011252396e-07, | |
| "loss": 0.0513, | |
| "reward": 0.404015829320997, | |
| "reward_std": 0.853428527712822, | |
| "rewards/cosine_scaled_reward": -0.03757544606924057, | |
| "rewards/format_reward": 0.4791666828095913, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 2943.8958740234375, | |
| "epoch": 0.2057142857142857, | |
| "grad_norm": 0.5542572736740112, | |
| "kl": 0.16357421875, | |
| "learning_rate": 8.270476638965461e-07, | |
| "loss": 0.0718, | |
| "reward": 0.4506250247359276, | |
| "reward_std": 0.7890695706009865, | |
| "rewards/cosine_scaled_reward": -0.045520816929638386, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 3004.7709045410156, | |
| "epoch": 0.20685714285714285, | |
| "grad_norm": 0.2843971252441406, | |
| "kl": 0.154541015625, | |
| "learning_rate": 8.245653237555705e-07, | |
| "loss": 0.0082, | |
| "reward": 0.4879231466911733, | |
| "reward_std": 0.9720990136265755, | |
| "rewards/cosine_scaled_reward": 0.014794901013374329, | |
| "rewards/format_reward": 0.4583333358168602, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 2924.9375610351562, | |
| "epoch": 0.208, | |
| "grad_norm": 0.5410143136978149, | |
| "kl": 0.20361328125, | |
| "learning_rate": 8.220696016880687e-07, | |
| "loss": 0.0912, | |
| "reward": 0.005998063832521439, | |
| "reward_std": 0.7128682732582092, | |
| "rewards/cosine_scaled_reward": -0.18450098019093275, | |
| "rewards/format_reward": 0.3750000111758709, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 2788.166748046875, | |
| "epoch": 0.20914285714285713, | |
| "grad_norm": 0.33847859501838684, | |
| "kl": 0.1676025390625, | |
| "learning_rate": 8.195606193320136e-07, | |
| "loss": 0.0478, | |
| "reward": -0.0011163651943206787, | |
| "reward_std": 0.5493139624595642, | |
| "rewards/cosine_scaled_reward": -0.24014152213931084, | |
| "rewards/format_reward": 0.479166679084301, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 2438.3541870117188, | |
| "epoch": 0.2102857142857143, | |
| "grad_norm": 0.5205087065696716, | |
| "kl": 0.181884765625, | |
| "learning_rate": 8.170384989716657e-07, | |
| "loss": -0.002, | |
| "reward": 0.896189346909523, | |
| "reward_std": 1.161486804485321, | |
| "rewards/cosine_scaled_reward": 0.1460113013163209, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 2972.6250610351562, | |
| "epoch": 0.21142857142857144, | |
| "grad_norm": 0.5775122046470642, | |
| "kl": 0.25244140625, | |
| "learning_rate": 8.145033635316128e-07, | |
| "loss": 0.0701, | |
| "reward": 0.1199110560119152, | |
| "reward_std": 0.8271754533052444, | |
| "rewards/cosine_scaled_reward": -0.11712781526148319, | |
| "rewards/format_reward": 0.3541666679084301, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 2840.1875610351562, | |
| "epoch": 0.21257142857142858, | |
| "grad_norm": 0.3676423728466034, | |
| "kl": 0.2158203125, | |
| "learning_rate": 8.119553365707802e-07, | |
| "loss": 0.059, | |
| "reward": 0.5942272543907166, | |
| "reward_std": 0.7698107957839966, | |
| "rewards/cosine_scaled_reward": 0.057530272752046585, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 2704.8333740234375, | |
| "epoch": 0.21371428571428572, | |
| "grad_norm": 0.3322462737560272, | |
| "kl": 0.2138671875, | |
| "learning_rate": 8.093945422764069e-07, | |
| "loss": 0.0419, | |
| "reward": 0.4956296235322952, | |
| "reward_std": 0.7072524651885033, | |
| "rewards/cosine_scaled_reward": 0.10198147594928741, | |
| "rewards/format_reward": 0.29166668094694614, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 2780.729248046875, | |
| "epoch": 0.21485714285714286, | |
| "grad_norm": 0.3984168469905853, | |
| "kl": 0.288330078125, | |
| "learning_rate": 8.068211054579943e-07, | |
| "loss": 0.0474, | |
| "reward": 0.5863161403685808, | |
| "reward_std": 0.9082886129617691, | |
| "rewards/cosine_scaled_reward": -0.008925255388021469, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 2758.1459350585938, | |
| "epoch": 0.216, | |
| "grad_norm": 0.3293847143650055, | |
| "kl": 0.314453125, | |
| "learning_rate": 8.04235151541222e-07, | |
| "loss": 0.0399, | |
| "reward": 0.07589801587164402, | |
| "reward_std": 0.6460907310247421, | |
| "rewards/cosine_scaled_reward": -0.15996766556054354, | |
| "rewards/format_reward": 0.3958333358168602, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 3006.666748046875, | |
| "epoch": 0.21714285714285714, | |
| "grad_norm": 0.3320949971675873, | |
| "kl": 0.3447265625, | |
| "learning_rate": 8.01636806561836e-07, | |
| "loss": 0.0357, | |
| "reward": -0.11713236942887306, | |
| "reward_std": 0.6270528212189674, | |
| "rewards/cosine_scaled_reward": -0.20439952798187733, | |
| "rewards/format_reward": 0.29166667349636555, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 2770.479248046875, | |
| "epoch": 0.21828571428571428, | |
| "grad_norm": 0.7219541668891907, | |
| "kl": 0.30859375, | |
| "learning_rate": 7.990261971595048e-07, | |
| "loss": 0.084, | |
| "reward": 0.37447334453463554, | |
| "reward_std": 0.9116730242967606, | |
| "rewards/cosine_scaled_reward": -0.0002633389085531235, | |
| "rewards/format_reward": 0.3750000149011612, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 2974.6875610351562, | |
| "epoch": 0.21942857142857142, | |
| "grad_norm": 0.44086411595344543, | |
| "kl": 0.38525390625, | |
| "learning_rate": 7.964034505716476e-07, | |
| "loss": 0.0333, | |
| "reward": 0.2543896287679672, | |
| "reward_std": 0.9647316783666611, | |
| "rewards/cosine_scaled_reward": -0.06030518375337124, | |
| "rewards/format_reward": 0.37500001303851604, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 2625.5209350585938, | |
| "epoch": 0.22057142857142858, | |
| "grad_norm": 0.439861536026001, | |
| "kl": 0.33935546875, | |
| "learning_rate": 7.93768694627233e-07, | |
| "loss": 0.0154, | |
| "reward": -0.05579917132854462, | |
| "reward_std": 0.552303358912468, | |
| "rewards/cosine_scaled_reward": -0.19456627347972244, | |
| "rewards/format_reward": 0.33333334140479565, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 1735.7917175292969, | |
| "epoch": 0.22171428571428572, | |
| "grad_norm": 0.3492659032344818, | |
| "kl": 0.223876953125, | |
| "learning_rate": 7.911220577405484e-07, | |
| "loss": 0.0307, | |
| "reward": 1.0081698819994926, | |
| "reward_std": 1.0613654553890228, | |
| "rewards/cosine_scaled_reward": 0.10825158283114433, | |
| "rewards/format_reward": 0.7916667014360428, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 2374.9375610351562, | |
| "epoch": 0.22285714285714286, | |
| "grad_norm": 0.39783236384391785, | |
| "kl": 0.357666015625, | |
| "learning_rate": 7.884636689049422e-07, | |
| "loss": 0.0483, | |
| "reward": 0.519692053552717, | |
| "reward_std": 0.8805719166994095, | |
| "rewards/cosine_scaled_reward": -0.04223730321973562, | |
| "rewards/format_reward": 0.6041666716337204, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 2985.0208740234375, | |
| "epoch": 0.224, | |
| "grad_norm": 0.5228659510612488, | |
| "kl": 0.39453125, | |
| "learning_rate": 7.857936576865356e-07, | |
| "loss": 0.0626, | |
| "reward": 0.31096921616699547, | |
| "reward_std": 0.9736936837434769, | |
| "rewards/cosine_scaled_reward": -0.07368208467960358, | |
| "rewards/format_reward": 0.45833334140479565, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 3063.1459045410156, | |
| "epoch": 0.22514285714285714, | |
| "grad_norm": 0.4522063732147217, | |
| "kl": 0.39892578125, | |
| "learning_rate": 7.831121542179086e-07, | |
| "loss": 0.047, | |
| "reward": -0.027099967002868652, | |
| "reward_std": 0.7299272418022156, | |
| "rewards/cosine_scaled_reward": -0.14896666258573532, | |
| "rewards/format_reward": 0.27083334140479565, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 3130.291748046875, | |
| "epoch": 0.22628571428571428, | |
| "grad_norm": 0.7724531888961792, | |
| "kl": 0.40771484375, | |
| "learning_rate": 7.804192891917571e-07, | |
| "loss": 0.0823, | |
| "reward": 0.07385630160570145, | |
| "reward_std": 0.7986228317022324, | |
| "rewards/cosine_scaled_reward": -0.18182185851037502, | |
| "rewards/format_reward": 0.4375000074505806, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 2927.9375610351562, | |
| "epoch": 0.22742857142857142, | |
| "grad_norm": 0.4081217050552368, | |
| "kl": 0.40234375, | |
| "learning_rate": 7.777151938545235e-07, | |
| "loss": 0.0405, | |
| "reward": 0.9421972185373306, | |
| "reward_std": 0.8113018572330475, | |
| "rewards/cosine_scaled_reward": 0.18984858132898808, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 2594.6875610351562, | |
| "epoch": 0.22857142857142856, | |
| "grad_norm": 1.1233628988265991, | |
| "kl": 0.4052734375, | |
| "learning_rate": 7.75e-07, | |
| "loss": 0.1087, | |
| "reward": 0.4042139081284404, | |
| "reward_std": 0.9797720313072205, | |
| "rewards/cosine_scaled_reward": -0.06872639432549477, | |
| "rewards/format_reward": 0.5416666772216558, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 2749.729217529297, | |
| "epoch": 0.2297142857142857, | |
| "grad_norm": 0.4544771611690521, | |
| "kl": 0.463134765625, | |
| "learning_rate": 7.72273839962904e-07, | |
| "loss": 0.0332, | |
| "reward": 0.049620624631643295, | |
| "reward_std": 0.6019374430179596, | |
| "rewards/cosine_scaled_reward": -0.18352303700521588, | |
| "rewards/format_reward": 0.41666667722165585, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 2291.3334350585938, | |
| "epoch": 0.23085714285714284, | |
| "grad_norm": 0.4469071328639984, | |
| "kl": 0.4375, | |
| "learning_rate": 7.695368466124296e-07, | |
| "loss": 0.0543, | |
| "reward": 0.33724231645464897, | |
| "reward_std": 0.6383469551801682, | |
| "rewards/cosine_scaled_reward": -0.10221217246726155, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 2692.416717529297, | |
| "epoch": 0.232, | |
| "grad_norm": 0.9959556460380554, | |
| "kl": 0.60302734375, | |
| "learning_rate": 7.667891533457718e-07, | |
| "loss": 0.0229, | |
| "reward": 0.5023867785930634, | |
| "reward_std": 0.8520723432302475, | |
| "rewards/cosine_scaled_reward": -0.009223278611898422, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 2610.291748046875, | |
| "epoch": 0.23314285714285715, | |
| "grad_norm": 0.5574892163276672, | |
| "kl": 0.54150390625, | |
| "learning_rate": 7.640308940816239e-07, | |
| "loss": 0.0779, | |
| "reward": 0.668186828494072, | |
| "reward_std": 0.7796131670475006, | |
| "rewards/cosine_scaled_reward": 0.04242673283442855, | |
| "rewards/format_reward": 0.5833333488553762, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 3158.0625, | |
| "epoch": 0.2342857142857143, | |
| "grad_norm": 0.9340919256210327, | |
| "kl": 0.658203125, | |
| "learning_rate": 7.612622032536507e-07, | |
| "loss": 0.091, | |
| "reward": 0.36712072789669037, | |
| "reward_std": 1.0377983078360558, | |
| "rewards/cosine_scaled_reward": -0.014356307685375214, | |
| "rewards/format_reward": 0.3958333507180214, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 3241.7500610351562, | |
| "epoch": 0.23542857142857143, | |
| "grad_norm": 0.6677758097648621, | |
| "kl": 0.7197265625, | |
| "learning_rate": 7.584832158039378e-07, | |
| "loss": 0.0745, | |
| "reward": -0.032032303512096405, | |
| "reward_std": 0.7234849855303764, | |
| "rewards/cosine_scaled_reward": -0.15143282152712345, | |
| "rewards/format_reward": 0.27083334885537624, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 2452.1876220703125, | |
| "epoch": 0.23657142857142857, | |
| "grad_norm": 0.9905790090560913, | |
| "kl": 0.4033203125, | |
| "learning_rate": 7.556940671764124e-07, | |
| "loss": 0.0568, | |
| "reward": 0.8914177902042866, | |
| "reward_std": 0.8338152915239334, | |
| "rewards/cosine_scaled_reward": 0.0811255220323801, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 2867.4583740234375, | |
| "epoch": 0.2377142857142857, | |
| "grad_norm": 1.0818088054656982, | |
| "kl": 0.6904296875, | |
| "learning_rate": 7.528948933102438e-07, | |
| "loss": 0.0301, | |
| "reward": 0.22067961934953928, | |
| "reward_std": 0.46099015325307846, | |
| "rewards/cosine_scaled_reward": -0.17091020289808512, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 2549.354217529297, | |
| "epoch": 0.23885714285714285, | |
| "grad_norm": 0.5277766585350037, | |
| "kl": 0.5927734375, | |
| "learning_rate": 7.500858306332172e-07, | |
| "loss": 0.0733, | |
| "reward": 0.142703301506117, | |
| "reward_std": 0.7169675379991531, | |
| "rewards/cosine_scaled_reward": -0.26198170334100723, | |
| "rewards/format_reward": 0.6666666716337204, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 2013.7500610351562, | |
| "epoch": 0.24, | |
| "grad_norm": 0.610791027545929, | |
| "kl": 0.40966796875, | |
| "learning_rate": 7.472670160550848e-07, | |
| "loss": 0.0513, | |
| "reward": 0.6129203364253044, | |
| "reward_std": 0.8901711851358414, | |
| "rewards/cosine_scaled_reward": -0.026873177848756313, | |
| "rewards/format_reward": 0.6666666772216558, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 3397.9376220703125, | |
| "epoch": 0.24114285714285713, | |
| "grad_norm": 0.8708758354187012, | |
| "kl": 0.751953125, | |
| "learning_rate": 7.444385869608921e-07, | |
| "loss": 0.0628, | |
| "reward": -0.10053645074367523, | |
| "reward_std": 0.5338989198207855, | |
| "rewards/cosine_scaled_reward": -0.14401823794469237, | |
| "rewards/format_reward": 0.1875000074505806, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 2320.8334045410156, | |
| "epoch": 0.2422857142857143, | |
| "grad_norm": 0.8576116561889648, | |
| "kl": 0.481201171875, | |
| "learning_rate": 7.416006812042827e-07, | |
| "loss": 0.0179, | |
| "reward": 0.7511888779699802, | |
| "reward_std": 0.8285558968782425, | |
| "rewards/cosine_scaled_reward": 0.021427758038043976, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 3072.2084350585938, | |
| "epoch": 0.24342857142857144, | |
| "grad_norm": 0.7516844272613525, | |
| "kl": 0.6279296875, | |
| "learning_rate": 7.387534371007797e-07, | |
| "loss": 0.0663, | |
| "reward": 0.14471609145402908, | |
| "reward_std": 0.5673011243343353, | |
| "rewards/cosine_scaled_reward": -0.09430863708257675, | |
| "rewards/format_reward": 0.3333333395421505, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 3004.666748046875, | |
| "epoch": 0.24457142857142858, | |
| "grad_norm": 0.650104820728302, | |
| "kl": 0.49853515625, | |
| "learning_rate": 7.358969934210438e-07, | |
| "loss": 0.048, | |
| "reward": 0.38014761358499527, | |
| "reward_std": 0.6449386551976204, | |
| "rewards/cosine_scaled_reward": -0.05992620065808296, | |
| "rewards/format_reward": 0.5000000055879354, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 2997.1251220703125, | |
| "epoch": 0.24571428571428572, | |
| "grad_norm": 0.8768295049667358, | |
| "kl": 0.55859375, | |
| "learning_rate": 7.330314893841101e-07, | |
| "loss": 0.0617, | |
| "reward": 0.14181075803935528, | |
| "reward_std": 0.7453153133392334, | |
| "rewards/cosine_scaled_reward": -0.21034463122487068, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 3182.6250610351562, | |
| "epoch": 0.24685714285714286, | |
| "grad_norm": 0.5447856187820435, | |
| "kl": 0.52685546875, | |
| "learning_rate": 7.301570646506027e-07, | |
| "loss": 0.0435, | |
| "reward": -0.2610638588666916, | |
| "reward_std": 0.5414926931262016, | |
| "rewards/cosine_scaled_reward": -0.2451152689754963, | |
| "rewards/format_reward": 0.2291666679084301, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 2864.8334350585938, | |
| "epoch": 0.248, | |
| "grad_norm": 0.5242255330085754, | |
| "kl": 0.46875, | |
| "learning_rate": 7.27273859315928e-07, | |
| "loss": 0.0353, | |
| "reward": 0.28853584825992584, | |
| "reward_std": 0.5657162964344025, | |
| "rewards/cosine_scaled_reward": -0.11614875216037035, | |
| "rewards/format_reward": 0.520833358168602, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 2654.9583740234375, | |
| "epoch": 0.24914285714285714, | |
| "grad_norm": 0.9366975426673889, | |
| "kl": 0.392578125, | |
| "learning_rate": 7.243820139034464e-07, | |
| "loss": 0.0515, | |
| "reward": 0.3301328122615814, | |
| "reward_std": 0.7091851830482483, | |
| "rewards/cosine_scaled_reward": -0.04326693775783497, | |
| "rewards/format_reward": 0.4166666828095913, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 2303.854217529297, | |
| "epoch": 0.2502857142857143, | |
| "grad_norm": 1.7971564531326294, | |
| "kl": 0.3369140625, | |
| "learning_rate": 7.214816693576234e-07, | |
| "loss": 0.0794, | |
| "reward": 0.6591267697513103, | |
| "reward_std": 0.9642367362976074, | |
| "rewards/cosine_scaled_reward": 0.03789670951664448, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 2634.2501220703125, | |
| "epoch": 0.25142857142857145, | |
| "grad_norm": 1.3504126071929932, | |
| "kl": 0.4423828125, | |
| "learning_rate": 7.185729670371604e-07, | |
| "loss": -0.0076, | |
| "reward": 0.41383227705955505, | |
| "reward_std": 0.64960727840662, | |
| "rewards/cosine_scaled_reward": -0.043083855882287025, | |
| "rewards/format_reward": 0.5000000204890966, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 2984.7709350585938, | |
| "epoch": 0.25257142857142856, | |
| "grad_norm": 0.9762473106384277, | |
| "kl": 0.4384765625, | |
| "learning_rate": 7.156560487081051e-07, | |
| "loss": 0.0725, | |
| "reward": 0.25423768046312034, | |
| "reward_std": 0.8094103336334229, | |
| "rewards/cosine_scaled_reward": -0.11246450617909431, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 3275.7500610351562, | |
| "epoch": 0.2537142857142857, | |
| "grad_norm": 0.37796396017074585, | |
| "kl": 0.533203125, | |
| "learning_rate": 7.127310565369415e-07, | |
| "loss": 0.0546, | |
| "reward": 0.08308765979018062, | |
| "reward_std": 0.6242133527994156, | |
| "rewards/cosine_scaled_reward": -0.18762284144759178, | |
| "rewards/format_reward": 0.4583333395421505, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 2415.2916870117188, | |
| "epoch": 0.25485714285714284, | |
| "grad_norm": 66.52708435058594, | |
| "kl": 19.72021484375, | |
| "learning_rate": 7.097981330836616e-07, | |
| "loss": 0.1598, | |
| "reward": 0.32358624786138535, | |
| "reward_std": 0.8794360756874084, | |
| "rewards/cosine_scaled_reward": -0.12987355142831802, | |
| "rewards/format_reward": 0.5833333507180214, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 2914.2084350585938, | |
| "epoch": 0.256, | |
| "grad_norm": 0.39709535241127014, | |
| "kl": 0.42919921875, | |
| "learning_rate": 7.068574212948169e-07, | |
| "loss": 0.026, | |
| "reward": 0.4726352207362652, | |
| "reward_std": 0.5715819150209427, | |
| "rewards/cosine_scaled_reward": -0.02409905381500721, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 2191.4584045410156, | |
| "epoch": 0.2571428571428571, | |
| "grad_norm": 1.4947963953018188, | |
| "kl": 0.361572265625, | |
| "learning_rate": 7.039090644965509e-07, | |
| "loss": 0.0904, | |
| "reward": 0.8724448978900909, | |
| "reward_std": 0.8835494965314865, | |
| "rewards/cosine_scaled_reward": 0.16538911685347557, | |
| "rewards/format_reward": 0.541666679084301, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 2944.8959350585938, | |
| "epoch": 0.2582857142857143, | |
| "grad_norm": 0.8030902147293091, | |
| "kl": 0.5966796875, | |
| "learning_rate": 7.009532063876148e-07, | |
| "loss": 0.0354, | |
| "reward": 0.16449306067079306, | |
| "reward_std": 0.7553341090679169, | |
| "rewards/cosine_scaled_reward": -0.14692013710737228, | |
| "rewards/format_reward": 0.4583333432674408, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 2399.1251220703125, | |
| "epoch": 0.25942857142857145, | |
| "grad_norm": 0.6294677257537842, | |
| "kl": 0.40478515625, | |
| "learning_rate": 6.979899910323624e-07, | |
| "loss": 0.0385, | |
| "reward": 0.6515897959470749, | |
| "reward_std": 0.7883607298135757, | |
| "rewards/cosine_scaled_reward": -0.01795511320233345, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 2944.2916870117188, | |
| "epoch": 0.26057142857142856, | |
| "grad_norm": 0.7098054885864258, | |
| "kl": 0.5126953125, | |
| "learning_rate": 6.950195628537299e-07, | |
| "loss": 0.0537, | |
| "reward": 0.2890019528567791, | |
| "reward_std": 0.8232990577816963, | |
| "rewards/cosine_scaled_reward": -0.1367490328848362, | |
| "rewards/format_reward": 0.5625, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 2747.541748046875, | |
| "epoch": 0.26171428571428573, | |
| "grad_norm": 0.3639421761035919, | |
| "kl": 0.53759765625, | |
| "learning_rate": 6.920420666261961e-07, | |
| "loss": 0.0462, | |
| "reward": 0.1284541985951364, | |
| "reward_std": 0.6105376034975052, | |
| "rewards/cosine_scaled_reward": -0.21702291443943977, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 2547.916748046875, | |
| "epoch": 0.26285714285714284, | |
| "grad_norm": 0.7889376878738403, | |
| "kl": 0.4453125, | |
| "learning_rate": 6.890576474687263e-07, | |
| "loss": 0.0666, | |
| "reward": 0.46958625549450517, | |
| "reward_std": 0.8848246484994888, | |
| "rewards/cosine_scaled_reward": 0.03687644610181451, | |
| "rewards/format_reward": 0.3958333395421505, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 2979.3125610351562, | |
| "epoch": 0.264, | |
| "grad_norm": 0.49910208582878113, | |
| "kl": 0.56689453125, | |
| "learning_rate": 6.860664508377001e-07, | |
| "loss": 0.0658, | |
| "reward": 0.34871126525104046, | |
| "reward_std": 0.7629459947347641, | |
| "rewards/cosine_scaled_reward": -0.0756443589925766, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 2503.5625610351562, | |
| "epoch": 0.2651428571428571, | |
| "grad_norm": 0.8284872174263, | |
| "kl": 0.412109375, | |
| "learning_rate": 6.83068622519821e-07, | |
| "loss": 0.0204, | |
| "reward": 0.6350362692028284, | |
| "reward_std": 1.1135509312152863, | |
| "rewards/cosine_scaled_reward": -0.02623187005519867, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 2727.8751220703125, | |
| "epoch": 0.2662857142857143, | |
| "grad_norm": 0.5221201181411743, | |
| "kl": 0.4931640625, | |
| "learning_rate": 6.800643086250121e-07, | |
| "loss": 0.0615, | |
| "reward": 0.4846220053732395, | |
| "reward_std": 0.7716068103909492, | |
| "rewards/cosine_scaled_reward": -0.049355676397681236, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 2544.7084350585938, | |
| "epoch": 0.2674285714285714, | |
| "grad_norm": 1.3812953233718872, | |
| "kl": 0.498046875, | |
| "learning_rate": 6.770536555792944e-07, | |
| "loss": -0.0119, | |
| "reward": 0.4157133437693119, | |
| "reward_std": 0.7185128927230835, | |
| "rewards/cosine_scaled_reward": -0.13589332532137632, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 2495.375045776367, | |
| "epoch": 0.26857142857142857, | |
| "grad_norm": 0.6437314748764038, | |
| "kl": 0.59716796875, | |
| "learning_rate": 6.740368101176495e-07, | |
| "loss": 0.0412, | |
| "reward": 0.5019040778279305, | |
| "reward_std": 0.6978631764650345, | |
| "rewards/cosine_scaled_reward": -0.019881299696862698, | |
| "rewards/format_reward": 0.5416666753590107, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 2483.3959350585938, | |
| "epoch": 0.26971428571428574, | |
| "grad_norm": 0.3919011950492859, | |
| "kl": 0.4892578125, | |
| "learning_rate": 6.710139192768694e-07, | |
| "loss": 0.0482, | |
| "reward": 0.2438975148834288, | |
| "reward_std": 0.648132249712944, | |
| "rewards/cosine_scaled_reward": -0.2009679153561592, | |
| "rewards/format_reward": 0.6458333656191826, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 2204.4584350585938, | |
| "epoch": 0.27085714285714285, | |
| "grad_norm": 0.8478395342826843, | |
| "kl": 0.39111328125, | |
| "learning_rate": 6.679851303883891e-07, | |
| "loss": 0.0545, | |
| "reward": 0.42290161666460335, | |
| "reward_std": 0.648314818739891, | |
| "rewards/cosine_scaled_reward": -0.06979918852448463, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 2635.062530517578, | |
| "epoch": 0.272, | |
| "grad_norm": 1.0054919719696045, | |
| "kl": 0.572265625, | |
| "learning_rate": 6.649505910711058e-07, | |
| "loss": 0.0721, | |
| "reward": 0.5835281796753407, | |
| "reward_std": 0.7386454343795776, | |
| "rewards/cosine_scaled_reward": -0.09365258179605007, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 3080.6875610351562, | |
| "epoch": 0.27314285714285713, | |
| "grad_norm": 0.8045799136161804, | |
| "kl": 0.7119140625, | |
| "learning_rate": 6.619104492241847e-07, | |
| "loss": 0.0514, | |
| "reward": 0.16217913012951612, | |
| "reward_std": 0.8966347873210907, | |
| "rewards/cosine_scaled_reward": -0.1376604586839676, | |
| "rewards/format_reward": 0.43750000558793545, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 2219.5000915527344, | |
| "epoch": 0.2742857142857143, | |
| "grad_norm": 1.3121085166931152, | |
| "kl": 0.403564453125, | |
| "learning_rate": 6.588648530198504e-07, | |
| "loss": 0.0747, | |
| "reward": 1.15125173330307, | |
| "reward_std": 0.957096055150032, | |
| "rewards/cosine_scaled_reward": 0.200625860132277, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 2043.0625610351562, | |
| "epoch": 0.2754285714285714, | |
| "grad_norm": 0.6292615532875061, | |
| "kl": 0.319580078125, | |
| "learning_rate": 6.558139508961654e-07, | |
| "loss": 0.0002, | |
| "reward": 0.9169554859399796, | |
| "reward_std": 0.5727524533867836, | |
| "rewards/cosine_scaled_reward": 0.07306107506155968, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 2313.4583740234375, | |
| "epoch": 0.2765714285714286, | |
| "grad_norm": 0.6727687120437622, | |
| "kl": 0.4599609375, | |
| "learning_rate": 6.527578915497951e-07, | |
| "loss": 0.0547, | |
| "reward": 0.634780153632164, | |
| "reward_std": 0.7665407210588455, | |
| "rewards/cosine_scaled_reward": -0.0992765948176384, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 2128.354248046875, | |
| "epoch": 0.2777142857142857, | |
| "grad_norm": 2.353132963180542, | |
| "kl": 0.46875, | |
| "learning_rate": 6.496968239287603e-07, | |
| "loss": -0.0288, | |
| "reward": 0.7288870755583048, | |
| "reward_std": 0.7078111618757248, | |
| "rewards/cosine_scaled_reward": -0.020973138511180878, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 2385.729217529297, | |
| "epoch": 0.27885714285714286, | |
| "grad_norm": 0.797772228717804, | |
| "kl": 0.435546875, | |
| "learning_rate": 6.466308972251785e-07, | |
| "loss": 0.0694, | |
| "reward": 0.9379732981324196, | |
| "reward_std": 0.76512710750103, | |
| "rewards/cosine_scaled_reward": 0.08356995694339275, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 2175.229217529297, | |
| "epoch": 0.28, | |
| "grad_norm": 0.4513607621192932, | |
| "kl": 0.4609375, | |
| "learning_rate": 6.435602608679916e-07, | |
| "loss": 0.0361, | |
| "reward": 0.7639665333554149, | |
| "reward_std": 0.5898980349302292, | |
| "rewards/cosine_scaled_reward": -0.0034334324300289154, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 2385.3959045410156, | |
| "epoch": 0.28114285714285714, | |
| "grad_norm": 1.354136347770691, | |
| "kl": 0.4619140625, | |
| "learning_rate": 6.404850645156841e-07, | |
| "loss": -0.0114, | |
| "reward": 0.5757800415158272, | |
| "reward_std": 0.4861333817243576, | |
| "rewards/cosine_scaled_reward": -0.09752664715051651, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 1984.0834045410156, | |
| "epoch": 0.2822857142857143, | |
| "grad_norm": 0.7202406525611877, | |
| "kl": 0.39306640625, | |
| "learning_rate": 6.374054580489873e-07, | |
| "loss": -0.0064, | |
| "reward": 0.7016956266015768, | |
| "reward_std": 0.6964651569724083, | |
| "rewards/cosine_scaled_reward": -0.03456886112689972, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 2431.6250610351562, | |
| "epoch": 0.2834285714285714, | |
| "grad_norm": 1.12034273147583, | |
| "kl": 0.375, | |
| "learning_rate": 6.343215915635761e-07, | |
| "loss": 0.0399, | |
| "reward": 0.47921356186270714, | |
| "reward_std": 0.7437918186187744, | |
| "rewards/cosine_scaled_reward": -0.10414323909208179, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 2641.5000610351562, | |
| "epoch": 0.2845714285714286, | |
| "grad_norm": 1.147722601890564, | |
| "kl": 0.466796875, | |
| "learning_rate": 6.31233615362752e-07, | |
| "loss": 0.0084, | |
| "reward": 0.4995560571551323, | |
| "reward_std": 0.7342625856399536, | |
| "rewards/cosine_scaled_reward": -0.04188864305615425, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 2112.0209045410156, | |
| "epoch": 0.2857142857142857, | |
| "grad_norm": 0.6532469987869263, | |
| "kl": 0.302490234375, | |
| "learning_rate": 6.281416799501187e-07, | |
| "loss": 0.0292, | |
| "reward": 0.6722276238724589, | |
| "reward_std": 1.072887122631073, | |
| "rewards/cosine_scaled_reward": 0.03403047751635313, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 2693.416748046875, | |
| "epoch": 0.28685714285714287, | |
| "grad_norm": 0.9663844108581543, | |
| "kl": 0.419921875, | |
| "learning_rate": 6.25045936022246e-07, | |
| "loss": 0.0569, | |
| "reward": 0.9957753866910934, | |
| "reward_std": 0.9329462796449661, | |
| "rewards/cosine_scaled_reward": 0.16455435939133167, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 2677.7709350585938, | |
| "epoch": 0.288, | |
| "grad_norm": 0.720365583896637, | |
| "kl": 0.42138671875, | |
| "learning_rate": 6.219465344613258e-07, | |
| "loss": 0.0584, | |
| "reward": 0.791405975818634, | |
| "reward_std": 0.8207461088895798, | |
| "rewards/cosine_scaled_reward": -0.010547026991844177, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 2163.2709350585938, | |
| "epoch": 0.28914285714285715, | |
| "grad_norm": 0.9754706025123596, | |
| "kl": 0.333984375, | |
| "learning_rate": 6.188436263278172e-07, | |
| "loss": -0.0284, | |
| "reward": 0.4755242392420769, | |
| "reward_std": 0.9357906579971313, | |
| "rewards/cosine_scaled_reward": -0.05390455946326256, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 2022.0834045410156, | |
| "epoch": 0.29028571428571426, | |
| "grad_norm": 0.7189564108848572, | |
| "kl": 0.29931640625, | |
| "learning_rate": 6.157373628530852e-07, | |
| "loss": -0.0134, | |
| "reward": 1.0547878816723824, | |
| "reward_std": 0.6990637332201004, | |
| "rewards/cosine_scaled_reward": 0.162810567766428, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 2299.6041870117188, | |
| "epoch": 0.2914285714285714, | |
| "grad_norm": 0.6565377712249756, | |
| "kl": 0.3150634765625, | |
| "learning_rate": 6.126278954320294e-07, | |
| "loss": 0.0082, | |
| "reward": 0.9156973995268345, | |
| "reward_std": 0.7535882145166397, | |
| "rewards/cosine_scaled_reward": 0.08284871588693932, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 1579.4583892822266, | |
| "epoch": 0.2925714285714286, | |
| "grad_norm": 0.25218111276626587, | |
| "kl": 0.13134765625, | |
| "learning_rate": 6.095153756157051e-07, | |
| "loss": -0.0037, | |
| "reward": 0.6594964060932398, | |
| "reward_std": 0.7463338524103165, | |
| "rewards/cosine_scaled_reward": -0.04525182023644447, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 2658.6876220703125, | |
| "epoch": 0.2937142857142857, | |
| "grad_norm": 0.475395530462265, | |
| "kl": 0.332275390625, | |
| "learning_rate": 6.06399955103937e-07, | |
| "loss": 0.0439, | |
| "reward": 0.4807323142886162, | |
| "reward_std": 0.7335182875394821, | |
| "rewards/cosine_scaled_reward": -0.1450505219399929, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 2290.5000610351562, | |
| "epoch": 0.2948571428571429, | |
| "grad_norm": 0.5613760948181152, | |
| "kl": 0.2305908203125, | |
| "learning_rate": 6.032817857379256e-07, | |
| "loss": 0.0305, | |
| "reward": 0.5192163055762649, | |
| "reward_std": 0.7799556702375412, | |
| "rewards/cosine_scaled_reward": -0.021641843486577272, | |
| "rewards/format_reward": 0.5625000149011612, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 2217.8959350585938, | |
| "epoch": 0.296, | |
| "grad_norm": 1.199144959449768, | |
| "kl": 0.24127197265625, | |
| "learning_rate": 6.001610194928464e-07, | |
| "loss": 0.049, | |
| "reward": 0.5793692320585251, | |
| "reward_std": 0.7019505053758621, | |
| "rewards/cosine_scaled_reward": -0.09573205607011914, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 2786.041748046875, | |
| "epoch": 0.29714285714285715, | |
| "grad_norm": 0.7002319693565369, | |
| "kl": 0.292236328125, | |
| "learning_rate": 5.97037808470444e-07, | |
| "loss": 0.0086, | |
| "reward": 0.5236682705581188, | |
| "reward_std": 0.5017373934388161, | |
| "rewards/cosine_scaled_reward": -0.10274921730160713, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 2164.291717529297, | |
| "epoch": 0.29828571428571427, | |
| "grad_norm": 0.2812724709510803, | |
| "kl": 0.186279296875, | |
| "learning_rate": 5.939123048916173e-07, | |
| "loss": 0.0171, | |
| "reward": 0.6918911039829254, | |
| "reward_std": 0.4820164740085602, | |
| "rewards/cosine_scaled_reward": -0.07072112709283829, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 2519.5625610351562, | |
| "epoch": 0.29942857142857143, | |
| "grad_norm": 0.4466201663017273, | |
| "kl": 0.25927734375, | |
| "learning_rate": 5.907846610890011e-07, | |
| "loss": 0.0037, | |
| "reward": 0.45665838569402695, | |
| "reward_std": 0.7808536291122437, | |
| "rewards/cosine_scaled_reward": -0.14667082950472832, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 2283.0208435058594, | |
| "epoch": 0.30057142857142854, | |
| "grad_norm": 0.9734614491462708, | |
| "kl": 0.24072265625, | |
| "learning_rate": 5.87655029499542e-07, | |
| "loss": -0.0445, | |
| "reward": 0.6200529932975769, | |
| "reward_std": 0.9734015464782715, | |
| "rewards/cosine_scaled_reward": -0.05455685779452324, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 2269.729248046875, | |
| "epoch": 0.3017142857142857, | |
| "grad_norm": 0.93758225440979, | |
| "kl": 0.242919921875, | |
| "learning_rate": 5.845235626570683e-07, | |
| "loss": 0.0552, | |
| "reward": 0.5712921991944313, | |
| "reward_std": 0.6152775660157204, | |
| "rewards/cosine_scaled_reward": -0.0789372380822897, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 2714.729248046875, | |
| "epoch": 0.3028571428571429, | |
| "grad_norm": 0.4690639078617096, | |
| "kl": 0.28564453125, | |
| "learning_rate": 5.813904131848564e-07, | |
| "loss": 0.0054, | |
| "reward": 0.33216356858611107, | |
| "reward_std": 0.5296753197908401, | |
| "rewards/cosine_scaled_reward": -0.11516822502017021, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 2834.8750610351562, | |
| "epoch": 0.304, | |
| "grad_norm": 0.6644603610038757, | |
| "kl": 0.278076171875, | |
| "learning_rate": 5.78255733788191e-07, | |
| "loss": 0.0086, | |
| "reward": 0.7553704380989075, | |
| "reward_std": 0.6663154512643814, | |
| "rewards/cosine_scaled_reward": -0.059814791195094585, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 2623.7291870117188, | |
| "epoch": 0.30514285714285716, | |
| "grad_norm": 0.4014008343219757, | |
| "kl": 0.30078125, | |
| "learning_rate": 5.751196772469237e-07, | |
| "loss": 0.0276, | |
| "reward": 0.574170459061861, | |
| "reward_std": 0.6768613308668137, | |
| "rewards/cosine_scaled_reward": -0.046248115599155426, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 2934.2916870117188, | |
| "epoch": 0.3062857142857143, | |
| "grad_norm": 0.32006382942199707, | |
| "kl": 0.24169921875, | |
| "learning_rate": 5.71982396408026e-07, | |
| "loss": 0.0186, | |
| "reward": 0.5890230983495712, | |
| "reward_std": 0.6336611211299896, | |
| "rewards/cosine_scaled_reward": -0.0388217861764133, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 2591.6459350585938, | |
| "epoch": 0.30742857142857144, | |
| "grad_norm": 0.2750188410282135, | |
| "kl": 0.2086181640625, | |
| "learning_rate": 5.688440441781398e-07, | |
| "loss": 0.0096, | |
| "reward": 0.4631531648337841, | |
| "reward_std": 0.5730658769607544, | |
| "rewards/cosine_scaled_reward": -0.15384008269757032, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 1949.6250305175781, | |
| "epoch": 0.30857142857142855, | |
| "grad_norm": 0.3348838686943054, | |
| "kl": 0.14434814453125, | |
| "learning_rate": 5.657047735161255e-07, | |
| "loss": 0.0019, | |
| "reward": 1.0058863386511803, | |
| "reward_std": 0.6113419234752655, | |
| "rewards/cosine_scaled_reward": 0.05502649489790201, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 2595.8333740234375, | |
| "epoch": 0.3097142857142857, | |
| "grad_norm": 0.3792303502559662, | |
| "kl": 0.18743896484375, | |
| "learning_rate": 5.625647374256061e-07, | |
| "loss": 0.0156, | |
| "reward": 1.184450313448906, | |
| "reward_std": 0.6347895562648773, | |
| "rewards/cosine_scaled_reward": 0.18597513809800148, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 3300.4583740234375, | |
| "epoch": 0.31085714285714283, | |
| "grad_norm": 0.4754711091518402, | |
| "kl": 0.2998046875, | |
| "learning_rate": 5.594240889475106e-07, | |
| "loss": 0.0323, | |
| "reward": 0.33772575482726097, | |
| "reward_std": 0.7981042563915253, | |
| "rewards/cosine_scaled_reward": -0.12280379980802536, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 2806.5208740234375, | |
| "epoch": 0.312, | |
| "grad_norm": 0.2589206397533417, | |
| "kl": 0.203857421875, | |
| "learning_rate": 5.562829811526154e-07, | |
| "loss": 0.0018, | |
| "reward": 0.4326868951320648, | |
| "reward_std": 0.6429417282342911, | |
| "rewards/cosine_scaled_reward": -0.1378232277929783, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 2775.6043090820312, | |
| "epoch": 0.31314285714285717, | |
| "grad_norm": 0.392734557390213, | |
| "kl": 0.182861328125, | |
| "learning_rate": 5.531415671340826e-07, | |
| "loss": 0.0352, | |
| "reward": 0.39707405120134354, | |
| "reward_std": 0.748130202293396, | |
| "rewards/cosine_scaled_reward": -0.11396298557519913, | |
| "rewards/format_reward": 0.6250000223517418, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 2929.979278564453, | |
| "epoch": 0.3142857142857143, | |
| "grad_norm": 0.700515627861023, | |
| "kl": 0.240478515625, | |
| "learning_rate": 5.5e-07, | |
| "loss": 0.0581, | |
| "reward": 0.3950451835989952, | |
| "reward_std": 0.9513901323080063, | |
| "rewards/cosine_scaled_reward": -0.06289407718577422, | |
| "rewards/format_reward": 0.5208333432674408, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 2392.2291870117188, | |
| "epoch": 0.31542857142857145, | |
| "grad_norm": 0.6831299066543579, | |
| "kl": 0.146484375, | |
| "learning_rate": 5.468584328659172e-07, | |
| "loss": 0.0305, | |
| "reward": 0.8106965273618698, | |
| "reward_std": 0.8061726838350296, | |
| "rewards/cosine_scaled_reward": -0.011318429373204708, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 2669.7709350585938, | |
| "epoch": 0.31657142857142856, | |
| "grad_norm": 1.2274115085601807, | |
| "kl": 0.221435546875, | |
| "learning_rate": 5.437170188473847e-07, | |
| "loss": 0.0847, | |
| "reward": 0.44736091792583466, | |
| "reward_std": 0.8726006895303726, | |
| "rewards/cosine_scaled_reward": -0.09923620894551277, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 2377.7500915527344, | |
| "epoch": 0.3177142857142857, | |
| "grad_norm": 0.6143187284469604, | |
| "kl": 0.225341796875, | |
| "learning_rate": 5.405759110524894e-07, | |
| "loss": 0.0193, | |
| "reward": 0.5976903513073921, | |
| "reward_std": 0.974912166595459, | |
| "rewards/cosine_scaled_reward": -0.0032381737837567925, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 2511.2083740234375, | |
| "epoch": 0.31885714285714284, | |
| "grad_norm": 0.7699910998344421, | |
| "kl": 0.2982177734375, | |
| "learning_rate": 5.37435262574394e-07, | |
| "loss": 0.0299, | |
| "reward": 0.3957599774003029, | |
| "reward_std": 0.8634193539619446, | |
| "rewards/cosine_scaled_reward": -0.10420336201786995, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 2782.8750610351562, | |
| "epoch": 0.32, | |
| "grad_norm": 0.9926307201385498, | |
| "kl": 0.310791015625, | |
| "learning_rate": 5.342952264838747e-07, | |
| "loss": 0.0676, | |
| "reward": 0.6104128423612565, | |
| "reward_std": 0.8384141325950623, | |
| "rewards/cosine_scaled_reward": -0.028126917779445648, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 2380.4584350585938, | |
| "epoch": 0.3211428571428571, | |
| "grad_norm": 0.883975088596344, | |
| "kl": 0.2364501953125, | |
| "learning_rate": 5.311559558218603e-07, | |
| "loss": -0.0298, | |
| "reward": 0.6390588581562042, | |
| "reward_std": 0.7505539357662201, | |
| "rewards/cosine_scaled_reward": -0.055470582097768784, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 2751.1666870117188, | |
| "epoch": 0.3222857142857143, | |
| "grad_norm": 0.6628551483154297, | |
| "kl": 0.340087890625, | |
| "learning_rate": 5.28017603591974e-07, | |
| "loss": 0.0545, | |
| "reward": 0.8024181574583054, | |
| "reward_std": 0.9694567918777466, | |
| "rewards/cosine_scaled_reward": 0.026209060102701187, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 2520.1458740234375, | |
| "epoch": 0.32342857142857145, | |
| "grad_norm": 0.5402534604072571, | |
| "kl": 0.352783203125, | |
| "learning_rate": 5.248803227530763e-07, | |
| "loss": 0.0129, | |
| "reward": 0.4531768709421158, | |
| "reward_std": 0.6381779089570045, | |
| "rewards/cosine_scaled_reward": -0.1588282436132431, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 2361.5625, | |
| "epoch": 0.32457142857142857, | |
| "grad_norm": 0.7840125560760498, | |
| "kl": 0.43798828125, | |
| "learning_rate": 5.21744266211809e-07, | |
| "loss": 0.0189, | |
| "reward": 0.3853081315755844, | |
| "reward_std": 0.7855608388781548, | |
| "rewards/cosine_scaled_reward": -0.07817927654832602, | |
| "rewards/format_reward": 0.5416666716337204, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 2909.3751220703125, | |
| "epoch": 0.32571428571428573, | |
| "grad_norm": 0.543645441532135, | |
| "kl": 0.51806640625, | |
| "learning_rate": 5.186095868151436e-07, | |
| "loss": 0.059, | |
| "reward": 0.0715614715591073, | |
| "reward_std": 0.6991735994815826, | |
| "rewards/cosine_scaled_reward": -0.22463593445718288, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 2623.5000610351562, | |
| "epoch": 0.32685714285714285, | |
| "grad_norm": 1.0876595973968506, | |
| "kl": 0.3642578125, | |
| "learning_rate": 5.154764373429315e-07, | |
| "loss": 0.0895, | |
| "reward": 0.7619921118021011, | |
| "reward_std": 1.0285737365484238, | |
| "rewards/cosine_scaled_reward": 0.047662717290222645, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 2762.666748046875, | |
| "epoch": 0.328, | |
| "grad_norm": 0.7187138795852661, | |
| "kl": 0.50048828125, | |
| "learning_rate": 5.123449705004581e-07, | |
| "loss": 0.043, | |
| "reward": 0.5433498155325651, | |
| "reward_std": 0.6913661956787109, | |
| "rewards/cosine_scaled_reward": -0.061658430844545364, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 2270.8333740234375, | |
| "epoch": 0.3291428571428571, | |
| "grad_norm": 0.34955894947052, | |
| "kl": 0.260986328125, | |
| "learning_rate": 5.09215338910999e-07, | |
| "loss": 0.019, | |
| "reward": 0.9035947173833847, | |
| "reward_std": 0.8012775778770447, | |
| "rewards/cosine_scaled_reward": -0.006535984575748444, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 2480.8541870117188, | |
| "epoch": 0.3302857142857143, | |
| "grad_norm": 1.0728695392608643, | |
| "kl": 0.474609375, | |
| "learning_rate": 5.060876951083828e-07, | |
| "loss": 0.0877, | |
| "reward": 0.5563938245177269, | |
| "reward_std": 0.8119515627622604, | |
| "rewards/cosine_scaled_reward": -0.06555308337556198, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 2005.3542175292969, | |
| "epoch": 0.3314285714285714, | |
| "grad_norm": 2.5518229007720947, | |
| "kl": 0.4202880859375, | |
| "learning_rate": 5.02962191529556e-07, | |
| "loss": 0.1377, | |
| "reward": 1.0121518671512604, | |
| "reward_std": 1.0199929028749466, | |
| "rewards/cosine_scaled_reward": 0.14149258099496365, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 1837.25, | |
| "epoch": 0.3325714285714286, | |
| "grad_norm": 0.5082411766052246, | |
| "kl": 0.318115234375, | |
| "learning_rate": 4.998389805071536e-07, | |
| "loss": -0.0025, | |
| "reward": 0.5244562700390816, | |
| "reward_std": 0.8083207458257675, | |
| "rewards/cosine_scaled_reward": -0.09193855058401823, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 2516.000030517578, | |
| "epoch": 0.33371428571428574, | |
| "grad_norm": 0.6963807344436646, | |
| "kl": 0.496826171875, | |
| "learning_rate": 4.967182142620745e-07, | |
| "loss": 0.0554, | |
| "reward": 0.6148294545710087, | |
| "reward_std": 0.7742474526166916, | |
| "rewards/cosine_scaled_reward": -0.025918614119291306, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 2563.354248046875, | |
| "epoch": 0.33485714285714285, | |
| "grad_norm": 0.4553970992565155, | |
| "kl": 0.64111328125, | |
| "learning_rate": 4.93600044896063e-07, | |
| "loss": 0.08, | |
| "reward": 0.4226888967677951, | |
| "reward_std": 0.8445644974708557, | |
| "rewards/cosine_scaled_reward": -0.12198889185674489, | |
| "rewards/format_reward": 0.666666679084301, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 2474.6459350585938, | |
| "epoch": 0.336, | |
| "grad_norm": 0.5785382390022278, | |
| "kl": 0.543212890625, | |
| "learning_rate": 4.904846243842949e-07, | |
| "loss": 0.0498, | |
| "reward": 0.7478385232388973, | |
| "reward_std": 0.7380570024251938, | |
| "rewards/cosine_scaled_reward": 0.08225257322192192, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 2818.1043090820312, | |
| "epoch": 0.33714285714285713, | |
| "grad_norm": 1.9920473098754883, | |
| "kl": 1.005859375, | |
| "learning_rate": 4.873721045679706e-07, | |
| "loss": 0.0599, | |
| "reward": 0.38695642724633217, | |
| "reward_std": 0.8360127806663513, | |
| "rewards/cosine_scaled_reward": -0.0461051338352263, | |
| "rewards/format_reward": 0.4791666716337204, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 2180.6875610351562, | |
| "epoch": 0.3382857142857143, | |
| "grad_norm": 1.0185471773147583, | |
| "kl": 0.60888671875, | |
| "learning_rate": 4.842626371469149e-07, | |
| "loss": 0.0929, | |
| "reward": 0.9686335474252701, | |
| "reward_std": 0.9049602597951889, | |
| "rewards/cosine_scaled_reward": 0.1405667569488287, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 2705.2709350585938, | |
| "epoch": 0.3394285714285714, | |
| "grad_norm": 1.4574670791625977, | |
| "kl": 0.7529296875, | |
| "learning_rate": 4.811563736721829e-07, | |
| "loss": 0.0525, | |
| "reward": 0.3473209235817194, | |
| "reward_std": 0.7314907014369965, | |
| "rewards/cosine_scaled_reward": -0.12842286378145218, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 2661.5416870117188, | |
| "epoch": 0.3405714285714286, | |
| "grad_norm": 1.0324411392211914, | |
| "kl": 0.779296875, | |
| "learning_rate": 4.780534655386743e-07, | |
| "loss": 0.0626, | |
| "reward": 0.44023372419178486, | |
| "reward_std": 0.7127360999584198, | |
| "rewards/cosine_scaled_reward": -0.04029981233179569, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 2836.0626220703125, | |
| "epoch": 0.3417142857142857, | |
| "grad_norm": 1.2534230947494507, | |
| "kl": 0.66015625, | |
| "learning_rate": 4.749540639777539e-07, | |
| "loss": 0.0559, | |
| "reward": 0.4187684841454029, | |
| "reward_std": 0.7654632180929184, | |
| "rewards/cosine_scaled_reward": -0.1031157523393631, | |
| "rewards/format_reward": 0.6250000260770321, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 2193.541717529297, | |
| "epoch": 0.34285714285714286, | |
| "grad_norm": 1.023747444152832, | |
| "kl": 0.4393310546875, | |
| "learning_rate": 4.7185832004988133e-07, | |
| "loss": 0.0057, | |
| "reward": 0.7049860581755638, | |
| "reward_std": 0.8015492558479309, | |
| "rewards/cosine_scaled_reward": 0.07124301791191101, | |
| "rewards/format_reward": 0.5625000223517418, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 2034.166748046875, | |
| "epoch": 0.344, | |
| "grad_norm": 1.0728156566619873, | |
| "kl": 0.6123046875, | |
| "learning_rate": 4.68766384637248e-07, | |
| "loss": 0.0087, | |
| "reward": 0.5370926359901205, | |
| "reward_std": 0.8870838582515717, | |
| "rewards/cosine_scaled_reward": -0.05437035672366619, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 1496.7708587646484, | |
| "epoch": 0.34514285714285714, | |
| "grad_norm": 0.36257851123809814, | |
| "kl": 0.46044921875, | |
| "learning_rate": 4.656784084364238e-07, | |
| "loss": -0.0228, | |
| "reward": 0.484084477648139, | |
| "reward_std": 0.7823295146226883, | |
| "rewards/cosine_scaled_reward": -0.01837443746626377, | |
| "rewards/format_reward": 0.5208333358168602, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 1376.9167175292969, | |
| "epoch": 0.3462857142857143, | |
| "grad_norm": 0.30551737546920776, | |
| "kl": 0.42236328125, | |
| "learning_rate": 4.6259454195101267e-07, | |
| "loss": -0.0461, | |
| "reward": 0.9217020869255066, | |
| "reward_std": 0.7940811067819595, | |
| "rewards/cosine_scaled_reward": 0.07543436251580715, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 1413.708396911621, | |
| "epoch": 0.3474285714285714, | |
| "grad_norm": 0.9130037426948547, | |
| "kl": 0.74609375, | |
| "learning_rate": 4.59514935484316e-07, | |
| "loss": -0.0368, | |
| "reward": 0.7251628190279007, | |
| "reward_std": 1.0211279392242432, | |
| "rewards/cosine_scaled_reward": 0.05008140648715198, | |
| "rewards/format_reward": 0.6250000074505806, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 1933.5208740234375, | |
| "epoch": 0.3485714285714286, | |
| "grad_norm": 0.6181937456130981, | |
| "kl": 0.59716796875, | |
| "learning_rate": 4.5643973913200837e-07, | |
| "loss": -0.0665, | |
| "reward": 0.6453933482989669, | |
| "reward_std": 0.8129071295261383, | |
| "rewards/cosine_scaled_reward": 0.03103000298142433, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 1331.7917098999023, | |
| "epoch": 0.3497142857142857, | |
| "grad_norm": 0.2622654139995575, | |
| "kl": 0.6375732421875, | |
| "learning_rate": 4.5336910277482155e-07, | |
| "loss": -0.0564, | |
| "reward": 0.4545041471719742, | |
| "reward_std": 0.6556018441915512, | |
| "rewards/cosine_scaled_reward": -0.08524793572723866, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 1522.2916870117188, | |
| "epoch": 0.35085714285714287, | |
| "grad_norm": 0.3843940198421478, | |
| "kl": 0.647705078125, | |
| "learning_rate": 4.503031760712397e-07, | |
| "loss": -0.0408, | |
| "reward": 0.9578620158135891, | |
| "reward_std": 0.9549144953489304, | |
| "rewards/cosine_scaled_reward": 0.15601433627307415, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 2036.0834045410156, | |
| "epoch": 0.352, | |
| "grad_norm": 0.8481309413909912, | |
| "kl": 0.606689453125, | |
| "learning_rate": 4.4724210845020494e-07, | |
| "loss": -0.0199, | |
| "reward": 0.631169930100441, | |
| "reward_std": 0.7533179372549057, | |
| "rewards/cosine_scaled_reward": -0.028165025636553764, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 1487.1666870117188, | |
| "epoch": 0.35314285714285715, | |
| "grad_norm": 1.9852585792541504, | |
| "kl": 0.5830078125, | |
| "learning_rate": 4.441860491038345e-07, | |
| "loss": 0.0105, | |
| "reward": 0.7891280353069305, | |
| "reward_std": 0.8583121746778488, | |
| "rewards/cosine_scaled_reward": 0.07164734601974487, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 1955.791748046875, | |
| "epoch": 0.35428571428571426, | |
| "grad_norm": 0.31575194001197815, | |
| "kl": 0.184326171875, | |
| "learning_rate": 4.4113514698014953e-07, | |
| "loss": -0.0014, | |
| "reward": 0.8256345121189952, | |
| "reward_std": 0.7062153369188309, | |
| "rewards/cosine_scaled_reward": 0.048233918845653534, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 1666.0833740234375, | |
| "epoch": 0.3554285714285714, | |
| "grad_norm": 2.016129970550537, | |
| "kl": 0.47119140625, | |
| "learning_rate": 4.3808955077581546e-07, | |
| "loss": 0.072, | |
| "reward": 0.8503673672676086, | |
| "reward_std": 0.8861262649297714, | |
| "rewards/cosine_scaled_reward": 0.08143368689343333, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 1778.1041870117188, | |
| "epoch": 0.3565714285714286, | |
| "grad_norm": 2.5336270332336426, | |
| "kl": 0.513916015625, | |
| "learning_rate": 4.350494089288943e-07, | |
| "loss": 0.0693, | |
| "reward": 0.5695639494806528, | |
| "reward_std": 0.7498121336102486, | |
| "rewards/cosine_scaled_reward": -0.038134701550006866, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 2122.4791870117188, | |
| "epoch": 0.3577142857142857, | |
| "grad_norm": 0.3355765640735626, | |
| "kl": 0.609619140625, | |
| "learning_rate": 4.3201486961161093e-07, | |
| "loss": -0.0237, | |
| "reward": 0.7382938861846924, | |
| "reward_std": 0.8554851859807968, | |
| "rewards/cosine_scaled_reward": -0.005853069946169853, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 2387.2083435058594, | |
| "epoch": 0.3588571428571429, | |
| "grad_norm": 3.036442756652832, | |
| "kl": 0.231201171875, | |
| "learning_rate": 4.2898608072313045e-07, | |
| "loss": 0.1037, | |
| "reward": 0.8101449112291448, | |
| "reward_std": 0.963694229722023, | |
| "rewards/cosine_scaled_reward": 0.01965576596558094, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 2170.729217529297, | |
| "epoch": 0.36, | |
| "grad_norm": 1.4392133951187134, | |
| "kl": 0.21209716796875, | |
| "learning_rate": 4.2596318988235037e-07, | |
| "loss": 0.045, | |
| "reward": 0.6554913818836212, | |
| "reward_std": 1.1266003251075745, | |
| "rewards/cosine_scaled_reward": -0.01600432489067316, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 2317.1459350585938, | |
| "epoch": 0.36114285714285715, | |
| "grad_norm": 0.4884386658668518, | |
| "kl": 0.36376953125, | |
| "learning_rate": 4.2294634442070553e-07, | |
| "loss": 0.0282, | |
| "reward": 0.29845087230205536, | |
| "reward_std": 0.6840033531188965, | |
| "rewards/cosine_scaled_reward": -0.15285790944471955, | |
| "rewards/format_reward": 0.6041666865348816, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 3088.2709350585938, | |
| "epoch": 0.36228571428571427, | |
| "grad_norm": 0.8027182817459106, | |
| "kl": 0.3505859375, | |
| "learning_rate": 4.1993569137498776e-07, | |
| "loss": 0.0242, | |
| "reward": 0.9088336080312729, | |
| "reward_std": 1.000715285539627, | |
| "rewards/cosine_scaled_reward": 0.1002501342445612, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 2317.3750610351562, | |
| "epoch": 0.36342857142857143, | |
| "grad_norm": 0.327318012714386, | |
| "kl": 0.3134765625, | |
| "learning_rate": 4.1693137748017915e-07, | |
| "loss": 0.0385, | |
| "reward": 0.6265020594000816, | |
| "reward_std": 0.7293453440070152, | |
| "rewards/cosine_scaled_reward": -0.040915639605373144, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 2849.3333740234375, | |
| "epoch": 0.36457142857142855, | |
| "grad_norm": 1.7290736436843872, | |
| "kl": 0.443359375, | |
| "learning_rate": 4.1393354916230005e-07, | |
| "loss": 0.098, | |
| "reward": 0.46177836135029793, | |
| "reward_std": 0.9352491050958633, | |
| "rewards/cosine_scaled_reward": -0.07119414396584034, | |
| "rewards/format_reward": 0.604166679084301, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 2402.8750610351562, | |
| "epoch": 0.3657142857142857, | |
| "grad_norm": 1.1702836751937866, | |
| "kl": 0.34814453125, | |
| "learning_rate": 4.1094235253127374e-07, | |
| "loss": 0.0587, | |
| "reward": 0.5764410048723221, | |
| "reward_std": 0.7314303368330002, | |
| "rewards/cosine_scaled_reward": -0.055529496632516384, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 2828.791748046875, | |
| "epoch": 0.3668571428571429, | |
| "grad_norm": 0.797664999961853, | |
| "kl": 0.52001953125, | |
| "learning_rate": 4.079579333738039e-07, | |
| "loss": 0.0412, | |
| "reward": 0.4816475547850132, | |
| "reward_std": 0.8193319886922836, | |
| "rewards/cosine_scaled_reward": -0.050842900411225855, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 2521.479248046875, | |
| "epoch": 0.368, | |
| "grad_norm": 1.1600196361541748, | |
| "kl": 0.3974609375, | |
| "learning_rate": 4.0498043714627006e-07, | |
| "loss": 0.024, | |
| "reward": 0.8539287596940994, | |
| "reward_std": 0.9238015562295914, | |
| "rewards/cosine_scaled_reward": 0.020714368554763496, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 2526.354278564453, | |
| "epoch": 0.36914285714285716, | |
| "grad_norm": 0.7439947128295898, | |
| "kl": 0.40966796875, | |
| "learning_rate": 4.020100089676376e-07, | |
| "loss": 0.0387, | |
| "reward": 0.9395965822041035, | |
| "reward_std": 0.7121690958738327, | |
| "rewards/cosine_scaled_reward": 0.0947982706129551, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 2963.6666870117188, | |
| "epoch": 0.3702857142857143, | |
| "grad_norm": 0.7919374108314514, | |
| "kl": 0.53271484375, | |
| "learning_rate": 3.9904679361238526e-07, | |
| "loss": 0.0574, | |
| "reward": 0.3954196572303772, | |
| "reward_std": 0.7907533347606659, | |
| "rewards/cosine_scaled_reward": -0.0939568355679512, | |
| "rewards/format_reward": 0.583333358168602, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 2059.416748046875, | |
| "epoch": 0.37142857142857144, | |
| "grad_norm": 0.7337906956672668, | |
| "kl": 0.30908203125, | |
| "learning_rate": 3.9609093550344907e-07, | |
| "loss": 0.0437, | |
| "reward": 0.6482492443174124, | |
| "reward_std": 0.976516529917717, | |
| "rewards/cosine_scaled_reward": -0.050875378074124455, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 2717.5001220703125, | |
| "epoch": 0.37257142857142855, | |
| "grad_norm": 0.7754512429237366, | |
| "kl": 0.4609375, | |
| "learning_rate": 3.931425787051832e-07, | |
| "loss": 0.0804, | |
| "reward": 0.5230683460831642, | |
| "reward_std": 0.7168317809700966, | |
| "rewards/cosine_scaled_reward": -0.09263250115327537, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 2365.666732788086, | |
| "epoch": 0.3737142857142857, | |
| "grad_norm": 0.9611565470695496, | |
| "kl": 0.370513916015625, | |
| "learning_rate": 3.902018669163384e-07, | |
| "loss": 0.013, | |
| "reward": 0.8529483936727047, | |
| "reward_std": 0.787610650062561, | |
| "rewards/cosine_scaled_reward": 0.05147417262196541, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 2920.5000610351562, | |
| "epoch": 0.37485714285714283, | |
| "grad_norm": 1.1496500968933105, | |
| "kl": 0.568359375, | |
| "learning_rate": 3.872689434630585e-07, | |
| "loss": 0.1313, | |
| "reward": 0.5756548047065735, | |
| "reward_std": 1.1168714761734009, | |
| "rewards/cosine_scaled_reward": -0.04550594184547663, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 2694.229248046875, | |
| "epoch": 0.376, | |
| "grad_norm": 1.6449869871139526, | |
| "kl": 0.4189453125, | |
| "learning_rate": 3.843439512918949e-07, | |
| "loss": 0.0905, | |
| "reward": 0.607914388179779, | |
| "reward_std": 0.9643268138170242, | |
| "rewards/cosine_scaled_reward": -0.0918761616339907, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 2766.041748046875, | |
| "epoch": 0.37714285714285717, | |
| "grad_norm": 0.8693978190422058, | |
| "kl": 0.56396484375, | |
| "learning_rate": 3.8142703296283953e-07, | |
| "loss": 0.0817, | |
| "reward": 0.42995208874344826, | |
| "reward_std": 0.9052233844995499, | |
| "rewards/cosine_scaled_reward": -0.07669062539935112, | |
| "rewards/format_reward": 0.5833333432674408, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 2704.2084350585938, | |
| "epoch": 0.3782857142857143, | |
| "grad_norm": 0.6593329906463623, | |
| "kl": 0.43994140625, | |
| "learning_rate": 3.785183306423767e-07, | |
| "loss": 0.0481, | |
| "reward": 0.5416111797094345, | |
| "reward_std": 0.7576990574598312, | |
| "rewards/cosine_scaled_reward": -0.07294442504644394, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 2430.1458740234375, | |
| "epoch": 0.37942857142857145, | |
| "grad_norm": 1.1451934576034546, | |
| "kl": 0.4638671875, | |
| "learning_rate": 3.7561798609655373e-07, | |
| "loss": 0.0949, | |
| "reward": 0.9672386646270752, | |
| "reward_std": 0.9684969633817673, | |
| "rewards/cosine_scaled_reward": 0.13986931554973125, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 2586.3959045410156, | |
| "epoch": 0.38057142857142856, | |
| "grad_norm": 1.2027528285980225, | |
| "kl": 0.5546875, | |
| "learning_rate": 3.72726140684072e-07, | |
| "loss": 0.0376, | |
| "reward": 0.24384124111384153, | |
| "reward_std": 0.6339670419692993, | |
| "rewards/cosine_scaled_reward": -0.2218293957412243, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 2716.5208740234375, | |
| "epoch": 0.38171428571428573, | |
| "grad_norm": 0.5679751634597778, | |
| "kl": 0.46875, | |
| "learning_rate": 3.6984293534939737e-07, | |
| "loss": 0.0595, | |
| "reward": 0.49158087372779846, | |
| "reward_std": 0.6254527196288109, | |
| "rewards/cosine_scaled_reward": -0.07712622173130512, | |
| "rewards/format_reward": 0.645833358168602, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 2559.4583740234375, | |
| "epoch": 0.38285714285714284, | |
| "grad_norm": 0.4788146913051605, | |
| "kl": 0.447509765625, | |
| "learning_rate": 3.6696851061588994e-07, | |
| "loss": 0.0589, | |
| "reward": 0.47583791986107826, | |
| "reward_std": 0.6539599671959877, | |
| "rewards/cosine_scaled_reward": -0.08499772474169731, | |
| "rewards/format_reward": 0.6458333432674408, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 2945.4584350585938, | |
| "epoch": 0.384, | |
| "grad_norm": 0.6187959313392639, | |
| "kl": 0.59814453125, | |
| "learning_rate": 3.641030065789562e-07, | |
| "loss": 0.1016, | |
| "reward": 0.08771202201023698, | |
| "reward_std": 0.7820224016904831, | |
| "rewards/cosine_scaled_reward": -0.23739399760961533, | |
| "rewards/format_reward": 0.5625000298023224, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 2430.8958740234375, | |
| "epoch": 0.3851428571428571, | |
| "grad_norm": 0.7578234672546387, | |
| "kl": 0.46826171875, | |
| "learning_rate": 3.612465628992203e-07, | |
| "loss": 0.0748, | |
| "reward": 0.5553858801722527, | |
| "reward_std": 0.7994070649147034, | |
| "rewards/cosine_scaled_reward": -0.06605706363916397, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 2227.916717529297, | |
| "epoch": 0.3862857142857143, | |
| "grad_norm": 0.8869759440422058, | |
| "kl": 0.354248046875, | |
| "learning_rate": 3.5839931879571725e-07, | |
| "loss": 0.0831, | |
| "reward": 0.7496502324938774, | |
| "reward_std": 0.8079821169376373, | |
| "rewards/cosine_scaled_reward": -0.0001748921349644661, | |
| "rewards/format_reward": 0.7500000223517418, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 2985.3334350585938, | |
| "epoch": 0.38742857142857146, | |
| "grad_norm": 1.4707542657852173, | |
| "kl": 0.666015625, | |
| "learning_rate": 3.555614130391079e-07, | |
| "loss": 0.1233, | |
| "reward": 0.36759741231799126, | |
| "reward_std": 0.8881158977746964, | |
| "rewards/cosine_scaled_reward": -0.06620129197835922, | |
| "rewards/format_reward": 0.5000000149011612, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 2439.7501220703125, | |
| "epoch": 0.38857142857142857, | |
| "grad_norm": 2.691328287124634, | |
| "kl": 0.453125, | |
| "learning_rate": 3.5273298394491515e-07, | |
| "loss": -0.0493, | |
| "reward": 1.0150221139192581, | |
| "reward_std": 0.9879051297903061, | |
| "rewards/cosine_scaled_reward": 0.11167772859334946, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 2257.937530517578, | |
| "epoch": 0.38971428571428574, | |
| "grad_norm": 0.7236793637275696, | |
| "kl": 0.3848876953125, | |
| "learning_rate": 3.4991416936678276e-07, | |
| "loss": 0.0485, | |
| "reward": 1.5515939444303513, | |
| "reward_std": 0.958163395524025, | |
| "rewards/cosine_scaled_reward": 0.35913030058145523, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 2541.0834045410156, | |
| "epoch": 0.39085714285714285, | |
| "grad_norm": 0.982089102268219, | |
| "kl": 0.48095703125, | |
| "learning_rate": 3.471051066897562e-07, | |
| "loss": 0.0531, | |
| "reward": 0.5335123301483691, | |
| "reward_std": 0.8991846293210983, | |
| "rewards/cosine_scaled_reward": -0.09782716228437494, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 2201.8125915527344, | |
| "epoch": 0.392, | |
| "grad_norm": 3.367811918258667, | |
| "kl": 0.84130859375, | |
| "learning_rate": 3.4430593282358777e-07, | |
| "loss": 0.0659, | |
| "reward": 1.025502122938633, | |
| "reward_std": 0.8074321299791336, | |
| "rewards/cosine_scaled_reward": 0.11691772192716599, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 2793.7501220703125, | |
| "epoch": 0.3931428571428571, | |
| "grad_norm": 0.6109259724617004, | |
| "kl": 0.50537109375, | |
| "learning_rate": 3.4151678419606233e-07, | |
| "loss": 0.0712, | |
| "reward": 0.599671695381403, | |
| "reward_std": 0.8611319363117218, | |
| "rewards/cosine_scaled_reward": -0.04391413927078247, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 2314.166778564453, | |
| "epoch": 0.3942857142857143, | |
| "grad_norm": 0.6686170697212219, | |
| "kl": 0.5712890625, | |
| "learning_rate": 3.387377967463493e-07, | |
| "loss": 0.0395, | |
| "reward": 0.634972408413887, | |
| "reward_std": 0.6707823574542999, | |
| "rewards/cosine_scaled_reward": -0.05751381441950798, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 2902.9584350585938, | |
| "epoch": 0.3954285714285714, | |
| "grad_norm": 0.6067929863929749, | |
| "kl": 0.57958984375, | |
| "learning_rate": 3.359691059183761e-07, | |
| "loss": 0.1087, | |
| "reward": 0.4132253248244524, | |
| "reward_std": 0.8897982537746429, | |
| "rewards/cosine_scaled_reward": -0.05380401201546192, | |
| "rewards/format_reward": 0.5208333507180214, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 1998.2083740234375, | |
| "epoch": 0.3965714285714286, | |
| "grad_norm": 0.9779978394508362, | |
| "kl": 0.2724609375, | |
| "learning_rate": 3.3321084665422803e-07, | |
| "loss": 0.0409, | |
| "reward": 1.1842745244503021, | |
| "reward_std": 1.0255057215690613, | |
| "rewards/cosine_scaled_reward": 0.17547059804201126, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 2504.416748046875, | |
| "epoch": 0.3977142857142857, | |
| "grad_norm": 0.7763749957084656, | |
| "kl": 0.45556640625, | |
| "learning_rate": 3.3046315338757026e-07, | |
| "loss": 0.0586, | |
| "reward": 0.6250789314508438, | |
| "reward_std": 0.745910570025444, | |
| "rewards/cosine_scaled_reward": -0.020793883129954338, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 1897.9375915527344, | |
| "epoch": 0.39885714285714285, | |
| "grad_norm": 0.759898841381073, | |
| "kl": 0.2515869140625, | |
| "learning_rate": 3.2772616003709616e-07, | |
| "loss": -0.0009, | |
| "reward": 1.2401193976402283, | |
| "reward_std": 0.7767119854688644, | |
| "rewards/cosine_scaled_reward": 0.18255970953032374, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 1845.9792175292969, | |
| "epoch": 0.4, | |
| "grad_norm": 0.5678505301475525, | |
| "kl": 0.2552642822265625, | |
| "learning_rate": 3.250000000000001e-07, | |
| "loss": 0.0331, | |
| "reward": 1.1045997142791748, | |
| "reward_std": 0.6993750482797623, | |
| "rewards/cosine_scaled_reward": 0.13563317246735096, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 2162.1250610351562, | |
| "epoch": 0.40114285714285713, | |
| "grad_norm": 0.8248549699783325, | |
| "kl": 0.34246826171875, | |
| "learning_rate": 3.222848061454764e-07, | |
| "loss": 0.0701, | |
| "reward": 0.6730905398726463, | |
| "reward_std": 1.0314117968082428, | |
| "rewards/cosine_scaled_reward": -0.03845473984256387, | |
| "rewards/format_reward": 0.75, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 2398.8750915527344, | |
| "epoch": 0.4022857142857143, | |
| "grad_norm": 0.7086507678031921, | |
| "kl": 0.337158203125, | |
| "learning_rate": 3.195807108082429e-07, | |
| "loss": 0.0512, | |
| "reward": 0.5578571353107691, | |
| "reward_std": 0.8292429894208908, | |
| "rewards/cosine_scaled_reward": -0.10648808628320694, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 2411.541778564453, | |
| "epoch": 0.4034285714285714, | |
| "grad_norm": 0.43448832631111145, | |
| "kl": 0.3551025390625, | |
| "learning_rate": 3.168878457820915e-07, | |
| "loss": 0.032, | |
| "reward": 0.7701159529387951, | |
| "reward_std": 0.8441641330718994, | |
| "rewards/cosine_scaled_reward": 0.010057959705591202, | |
| "rewards/format_reward": 0.75, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 2516.8750915527344, | |
| "epoch": 0.4045714285714286, | |
| "grad_norm": 0.47943782806396484, | |
| "kl": 0.382568359375, | |
| "learning_rate": 3.142063423134644e-07, | |
| "loss": 0.0606, | |
| "reward": 0.435189101845026, | |
| "reward_std": 0.6631861850619316, | |
| "rewards/cosine_scaled_reward": -0.13657212257385254, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 1538.3125610351562, | |
| "epoch": 0.4057142857142857, | |
| "grad_norm": 0.3774828314781189, | |
| "kl": 0.3017578125, | |
| "learning_rate": 3.115363310950578e-07, | |
| "loss": 0.0368, | |
| "reward": 0.8316129595041275, | |
| "reward_std": 0.5808935090899467, | |
| "rewards/cosine_scaled_reward": -0.021693539805710316, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 2109.8333435058594, | |
| "epoch": 0.40685714285714286, | |
| "grad_norm": 0.3181619346141815, | |
| "kl": 0.30126953125, | |
| "learning_rate": 3.0887794225945143e-07, | |
| "loss": 0.0337, | |
| "reward": 0.5732035748660564, | |
| "reward_std": 0.6602266579866409, | |
| "rewards/cosine_scaled_reward": -0.057148221880197525, | |
| "rewards/format_reward": 0.6875000298023224, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 2442.1458740234375, | |
| "epoch": 0.408, | |
| "grad_norm": 0.8465009927749634, | |
| "kl": 0.5537109375, | |
| "learning_rate": 3.062313053727671e-07, | |
| "loss": 0.0438, | |
| "reward": 0.5404957421123981, | |
| "reward_std": 0.6692793369293213, | |
| "rewards/cosine_scaled_reward": -0.08391880989074707, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 2172.5001220703125, | |
| "epoch": 0.40914285714285714, | |
| "grad_norm": 0.5915915966033936, | |
| "kl": 0.2880859375, | |
| "learning_rate": 3.0359654942835247e-07, | |
| "loss": 0.04, | |
| "reward": 0.9776165038347244, | |
| "reward_std": 0.8002345710992813, | |
| "rewards/cosine_scaled_reward": 0.07214158028364182, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 1994.7709350585938, | |
| "epoch": 0.4102857142857143, | |
| "grad_norm": 0.5695796608924866, | |
| "kl": 0.33642578125, | |
| "learning_rate": 3.0097380284049523e-07, | |
| "loss": 0.0421, | |
| "reward": 0.5635941876098514, | |
| "reward_std": 0.682354062795639, | |
| "rewards/cosine_scaled_reward": -0.08278624271042645, | |
| "rewards/format_reward": 0.7291666716337204, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 1582.5625305175781, | |
| "epoch": 0.4114285714285714, | |
| "grad_norm": 0.6911218166351318, | |
| "kl": 0.187103271484375, | |
| "learning_rate": 2.9836319343816397e-07, | |
| "loss": 0.038, | |
| "reward": 0.9810230135917664, | |
| "reward_std": 0.6732440888881683, | |
| "rewards/cosine_scaled_reward": 0.03217813931405544, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 1716.8541870117188, | |
| "epoch": 0.4125714285714286, | |
| "grad_norm": 0.755465567111969, | |
| "kl": 0.2716064453125, | |
| "learning_rate": 2.9576484845877793e-07, | |
| "loss": -0.0037, | |
| "reward": 0.4921398665755987, | |
| "reward_std": 0.7469517663121223, | |
| "rewards/cosine_scaled_reward": -0.10809672623872757, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 2381.7708435058594, | |
| "epoch": 0.4137142857142857, | |
| "grad_norm": 0.4649311900138855, | |
| "kl": 0.435546875, | |
| "learning_rate": 2.931788945420058e-07, | |
| "loss": 0.0655, | |
| "reward": 0.3485546410083771, | |
| "reward_std": 0.8100304752588272, | |
| "rewards/cosine_scaled_reward": -0.13822269346565008, | |
| "rewards/format_reward": 0.6250000149011612, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 2278.6876220703125, | |
| "epoch": 0.41485714285714287, | |
| "grad_norm": 0.38487836718559265, | |
| "kl": 0.3544921875, | |
| "learning_rate": 2.9060545772359305e-07, | |
| "loss": 0.0483, | |
| "reward": 0.6228149347007275, | |
| "reward_std": 0.7660052478313446, | |
| "rewards/cosine_scaled_reward": -0.05317586287856102, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 1783.0834045410156, | |
| "epoch": 0.416, | |
| "grad_norm": 0.6700667142868042, | |
| "kl": 0.27978515625, | |
| "learning_rate": 2.8804466342921987e-07, | |
| "loss": 0.006, | |
| "reward": 0.5264641232788563, | |
| "reward_std": 0.7023270279169083, | |
| "rewards/cosine_scaled_reward": -0.12218462734017521, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 1910.2500305175781, | |
| "epoch": 0.41714285714285715, | |
| "grad_norm": 0.7392496466636658, | |
| "kl": 0.290771484375, | |
| "learning_rate": 2.854966364683872e-07, | |
| "loss": 0.0333, | |
| "reward": 0.8516478016972542, | |
| "reward_std": 0.938531182706356, | |
| "rewards/cosine_scaled_reward": 0.009157223626971245, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 2063.8958740234375, | |
| "epoch": 0.41828571428571426, | |
| "grad_norm": 1.9315472841262817, | |
| "kl": 0.2879638671875, | |
| "learning_rate": 2.829615010283344e-07, | |
| "loss": 0.068, | |
| "reward": 0.9369229730218649, | |
| "reward_std": 0.8918980956077576, | |
| "rewards/cosine_scaled_reward": 0.09346149861812592, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 1400.2917175292969, | |
| "epoch": 0.41942857142857143, | |
| "grad_norm": 0.2165093868970871, | |
| "kl": 0.1763916015625, | |
| "learning_rate": 2.8043938066798645e-07, | |
| "loss": 0.0193, | |
| "reward": 0.9957811124622822, | |
| "reward_std": 0.45480820536613464, | |
| "rewards/cosine_scaled_reward": 0.04997388273477554, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 1647.7916717529297, | |
| "epoch": 0.4205714285714286, | |
| "grad_norm": 0.7413077354431152, | |
| "kl": 0.174774169921875, | |
| "learning_rate": 2.7793039831193133e-07, | |
| "loss": -0.0034, | |
| "reward": 0.8528083562850952, | |
| "reward_std": 0.8265992403030396, | |
| "rewards/cosine_scaled_reward": 0.009737495332956314, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 1487.3958435058594, | |
| "epoch": 0.4217142857142857, | |
| "grad_norm": 0.6509503722190857, | |
| "kl": 0.12530517578125, | |
| "learning_rate": 2.7543467624442956e-07, | |
| "loss": -0.0257, | |
| "reward": 0.9031364023685455, | |
| "reward_std": 0.9219841361045837, | |
| "rewards/cosine_scaled_reward": 0.03490149416029453, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 2323.229248046875, | |
| "epoch": 0.4228571428571429, | |
| "grad_norm": 1.1870368719100952, | |
| "kl": 0.2625732421875, | |
| "learning_rate": 2.729523361034538e-07, | |
| "loss": -0.0417, | |
| "reward": 0.7300510033965111, | |
| "reward_std": 0.8341569006443024, | |
| "rewards/cosine_scaled_reward": -0.051641182973980904, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 1885.0834045410156, | |
| "epoch": 0.424, | |
| "grad_norm": 0.3413795232772827, | |
| "kl": 0.232666015625, | |
| "learning_rate": 2.7048349887476037e-07, | |
| "loss": 0.0358, | |
| "reward": 0.33694031462073326, | |
| "reward_std": 0.7036072686314583, | |
| "rewards/cosine_scaled_reward": -0.21694651246070862, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 2071.812530517578, | |
| "epoch": 0.42514285714285716, | |
| "grad_norm": 0.9272376894950867, | |
| "kl": 0.242919921875, | |
| "learning_rate": 2.6802828488599294e-07, | |
| "loss": -0.0016, | |
| "reward": 0.9880311861634254, | |
| "reward_std": 0.629561685025692, | |
| "rewards/cosine_scaled_reward": 0.025265559554100037, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 2372.0834045410156, | |
| "epoch": 0.42628571428571427, | |
| "grad_norm": 0.8849138617515564, | |
| "kl": 0.249755859375, | |
| "learning_rate": 2.655868138008171e-07, | |
| "loss": 0.0062, | |
| "reward": 0.7052676677703857, | |
| "reward_std": 0.6477234065532684, | |
| "rewards/cosine_scaled_reward": -0.07444952987134457, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 2331.8125610351562, | |
| "epoch": 0.42742857142857144, | |
| "grad_norm": 0.5580031275749207, | |
| "kl": 0.309814453125, | |
| "learning_rate": 2.631592046130896e-07, | |
| "loss": 0.0456, | |
| "reward": 0.6995935346931219, | |
| "reward_std": 0.7008600682020187, | |
| "rewards/cosine_scaled_reward": 0.00604674918577075, | |
| "rewards/format_reward": 0.6875000223517418, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 1906.2083740234375, | |
| "epoch": 0.42857142857142855, | |
| "grad_norm": 0.5966392755508423, | |
| "kl": 0.3814697265625, | |
| "learning_rate": 2.6074557564105724e-07, | |
| "loss": 0.0463, | |
| "reward": 0.7689145356416702, | |
| "reward_std": 0.7337282001972198, | |
| "rewards/cosine_scaled_reward": -0.02179272472858429, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 1971.3125915527344, | |
| "epoch": 0.4297142857142857, | |
| "grad_norm": 1.3154016733169556, | |
| "kl": 0.175048828125, | |
| "learning_rate": 2.583460445215911e-07, | |
| "loss": 0.0574, | |
| "reward": 0.968916192650795, | |
| "reward_std": 0.9032018631696701, | |
| "rewards/cosine_scaled_reward": 0.0677914135158062, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 2224.666748046875, | |
| "epoch": 0.4308571428571429, | |
| "grad_norm": 0.892139196395874, | |
| "kl": 0.1807861328125, | |
| "learning_rate": 2.5596072820445254e-07, | |
| "loss": 0.0031, | |
| "reward": 1.009105697274208, | |
| "reward_std": 0.9417294263839722, | |
| "rewards/cosine_scaled_reward": 0.09830283187329769, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 2115.2709045410156, | |
| "epoch": 0.432, | |
| "grad_norm": 0.9765793085098267, | |
| "kl": 0.26611328125, | |
| "learning_rate": 2.5358974294659373e-07, | |
| "loss": 0.0684, | |
| "reward": 0.5737282857298851, | |
| "reward_std": 0.6101915389299393, | |
| "rewards/cosine_scaled_reward": -0.12980252876877785, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 1658.2916870117188, | |
| "epoch": 0.43314285714285716, | |
| "grad_norm": 0.20954985916614532, | |
| "kl": 0.232666015625, | |
| "learning_rate": 2.512332043064913e-07, | |
| "loss": 0.0026, | |
| "reward": 0.6455265134572983, | |
| "reward_std": 0.5983955562114716, | |
| "rewards/cosine_scaled_reward": -0.08348675072193146, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 2212.4375610351562, | |
| "epoch": 0.4342857142857143, | |
| "grad_norm": 1.3722639083862305, | |
| "kl": 0.3023681640625, | |
| "learning_rate": 2.488912271385139e-07, | |
| "loss": 0.0993, | |
| "reward": 0.5304721817374229, | |
| "reward_std": 0.7781679779291153, | |
| "rewards/cosine_scaled_reward": -0.10976393148303032, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 1918.8750305175781, | |
| "epoch": 0.43542857142857144, | |
| "grad_norm": 0.7221528887748718, | |
| "kl": 0.295806884765625, | |
| "learning_rate": 2.465639255873246e-07, | |
| "loss": 0.0029, | |
| "reward": 0.9501378051936626, | |
| "reward_std": 0.6066517308354378, | |
| "rewards/cosine_scaled_reward": 0.047985561192035675, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 1793.6459045410156, | |
| "epoch": 0.43657142857142855, | |
| "grad_norm": 0.25511884689331055, | |
| "kl": 0.2493896484375, | |
| "learning_rate": 2.4425141308231765e-07, | |
| "loss": 0.0226, | |
| "reward": 0.9860572461038828, | |
| "reward_std": 0.6644920855760574, | |
| "rewards/cosine_scaled_reward": 0.08677859604358673, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 1990.2500915527344, | |
| "epoch": 0.4377142857142857, | |
| "grad_norm": 0.4499902129173279, | |
| "kl": 0.25341796875, | |
| "learning_rate": 2.4195380233209006e-07, | |
| "loss": 0.053, | |
| "reward": 0.7591063939034939, | |
| "reward_std": 0.5849988833069801, | |
| "rewards/cosine_scaled_reward": -0.04753013700246811, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 1889.8750610351562, | |
| "epoch": 0.43885714285714283, | |
| "grad_norm": 0.34465470910072327, | |
| "kl": 0.23828125, | |
| "learning_rate": 2.3967120531894857e-07, | |
| "loss": 0.0019, | |
| "reward": 0.626850601285696, | |
| "reward_std": 0.5293265283107758, | |
| "rewards/cosine_scaled_reward": -0.12407470063772053, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 1649.5208587646484, | |
| "epoch": 0.44, | |
| "grad_norm": 1.0988309383392334, | |
| "kl": 0.2174072265625, | |
| "learning_rate": 2.374037332934512e-07, | |
| "loss": 0.046, | |
| "reward": 0.8215210735797882, | |
| "reward_std": 0.7156432569026947, | |
| "rewards/cosine_scaled_reward": -0.005906133679673076, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 1869.666748046875, | |
| "epoch": 0.44114285714285717, | |
| "grad_norm": 0.31057262420654297, | |
| "kl": 0.213623046875, | |
| "learning_rate": 2.3515149676898552e-07, | |
| "loss": 0.0312, | |
| "reward": 0.7544382140040398, | |
| "reward_std": 0.5287479311227798, | |
| "rewards/cosine_scaled_reward": -0.09153091069310904, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 2174.000030517578, | |
| "epoch": 0.4422857142857143, | |
| "grad_norm": 0.7334949374198914, | |
| "kl": 0.2723388671875, | |
| "learning_rate": 2.3291460551638237e-07, | |
| "loss": 0.0412, | |
| "reward": 0.7021404728293419, | |
| "reward_std": 0.8102448135614395, | |
| "rewards/cosine_scaled_reward": -0.07601310685276985, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 1764.0625305175781, | |
| "epoch": 0.44342857142857145, | |
| "grad_norm": 0.8506814241409302, | |
| "kl": 0.211181640625, | |
| "learning_rate": 2.306931685585657e-07, | |
| "loss": 0.0326, | |
| "reward": 0.9473480954766273, | |
| "reward_std": 0.7040945738554001, | |
| "rewards/cosine_scaled_reward": 0.025757367722690105, | |
| "rewards/format_reward": 0.895833358168602, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 1558.6875915527344, | |
| "epoch": 0.44457142857142856, | |
| "grad_norm": 1.0051478147506714, | |
| "kl": 0.10626220703125, | |
| "learning_rate": 2.2848729416523859e-07, | |
| "loss": 0.0233, | |
| "reward": 1.2105353027582169, | |
| "reward_std": 0.7370782792568207, | |
| "rewards/cosine_scaled_reward": 0.14693431742489338, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 1673.0417175292969, | |
| "epoch": 0.44571428571428573, | |
| "grad_norm": 1.0045956373214722, | |
| "kl": 0.324462890625, | |
| "learning_rate": 2.2629708984760706e-07, | |
| "loss": -0.0122, | |
| "reward": 0.682011567056179, | |
| "reward_std": 0.668542355298996, | |
| "rewards/cosine_scaled_reward": -0.08607756206765771, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 1759.604248046875, | |
| "epoch": 0.44685714285714284, | |
| "grad_norm": 0.8641379475593567, | |
| "kl": 0.30419921875, | |
| "learning_rate": 2.2412266235313973e-07, | |
| "loss": -0.0151, | |
| "reward": 0.40198634564876556, | |
| "reward_std": 0.4891185835003853, | |
| "rewards/cosine_scaled_reward": -0.23650683648884296, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 1997.0208740234375, | |
| "epoch": 0.448, | |
| "grad_norm": 0.601497232913971, | |
| "kl": 0.3251953125, | |
| "learning_rate": 2.2196411766036487e-07, | |
| "loss": 0.0246, | |
| "reward": 1.31626558303833, | |
| "reward_std": 0.8470017611980438, | |
| "rewards/cosine_scaled_reward": 0.2206327999010682, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 1767.3958740234375, | |
| "epoch": 0.4491428571428571, | |
| "grad_norm": 0.9790117740631104, | |
| "kl": 0.20623779296875, | |
| "learning_rate": 2.1982156097370557e-07, | |
| "loss": 0.0716, | |
| "reward": 1.0628649685531855, | |
| "reward_std": 0.7842252627015114, | |
| "rewards/cosine_scaled_reward": 0.09393247216939926, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 2281.5625610351562, | |
| "epoch": 0.4502857142857143, | |
| "grad_norm": 0.9092360138893127, | |
| "kl": 0.2666015625, | |
| "learning_rate": 2.1769509671835223e-07, | |
| "loss": 0.0071, | |
| "reward": 0.7091562300920486, | |
| "reward_std": 0.6370756179094315, | |
| "rewards/cosine_scaled_reward": -0.09333855286240578, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 2072.7083740234375, | |
| "epoch": 0.4514285714285714, | |
| "grad_norm": 0.6948179006576538, | |
| "kl": 0.335205078125, | |
| "learning_rate": 2.1558482853517253e-07, | |
| "loss": 0.0399, | |
| "reward": 0.6186719592660666, | |
| "reward_std": 0.8180225193500519, | |
| "rewards/cosine_scaled_reward": -0.06566403433680534, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 1713.0625305175781, | |
| "epoch": 0.45257142857142857, | |
| "grad_norm": 1.03392493724823, | |
| "kl": 0.2850341796875, | |
| "learning_rate": 2.134908592756607e-07, | |
| "loss": 0.0576, | |
| "reward": 0.6681124269962311, | |
| "reward_std": 0.72493577003479, | |
| "rewards/cosine_scaled_reward": -0.07219376973807812, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 2008.166748046875, | |
| "epoch": 0.45371428571428574, | |
| "grad_norm": 1.2174099683761597, | |
| "kl": 0.3359375, | |
| "learning_rate": 2.1141329099692406e-07, | |
| "loss": 0.0821, | |
| "reward": 1.3461299315094948, | |
| "reward_std": 0.8196755945682526, | |
| "rewards/cosine_scaled_reward": 0.2668149508535862, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 1758.4167175292969, | |
| "epoch": 0.45485714285714285, | |
| "grad_norm": 0.7967256307601929, | |
| "kl": 0.3011474609375, | |
| "learning_rate": 2.0935222495670968e-07, | |
| "loss": 0.0175, | |
| "reward": 1.0533079504966736, | |
| "reward_std": 0.9479693919420242, | |
| "rewards/cosine_scaled_reward": 0.057903981767594814, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 2110.0000610351562, | |
| "epoch": 0.456, | |
| "grad_norm": 0.6236258149147034, | |
| "kl": 0.3653564453125, | |
| "learning_rate": 2.0730776160846853e-07, | |
| "loss": 0.0552, | |
| "reward": 0.8325799964368343, | |
| "reward_std": 0.6572683453559875, | |
| "rewards/cosine_scaled_reward": -0.00037669437006115913, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 1693.8333587646484, | |
| "epoch": 0.45714285714285713, | |
| "grad_norm": 0.5594977736473083, | |
| "kl": 0.239166259765625, | |
| "learning_rate": 2.0528000059645995e-07, | |
| "loss": 0.0092, | |
| "reward": 0.5645224675536156, | |
| "reward_std": 0.47261467576026917, | |
| "rewards/cosine_scaled_reward": -0.18648880254477262, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 2298.4375610351562, | |
| "epoch": 0.4582857142857143, | |
| "grad_norm": 0.46592381596565247, | |
| "kl": 0.5498046875, | |
| "learning_rate": 2.032690407508949e-07, | |
| "loss": 0.0651, | |
| "reward": 0.7146447077393532, | |
| "reward_std": 0.9194528758525848, | |
| "rewards/cosine_scaled_reward": -0.05934431403875351, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 2858.3959350585938, | |
| "epoch": 0.4594285714285714, | |
| "grad_norm": 1.3920950889587402, | |
| "kl": 0.701171875, | |
| "learning_rate": 2.0127498008311922e-07, | |
| "loss": 0.0681, | |
| "reward": 0.38334885984659195, | |
| "reward_std": 0.6373907253146172, | |
| "rewards/cosine_scaled_reward": -0.20415889844298363, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 2565.5626220703125, | |
| "epoch": 0.4605714285714286, | |
| "grad_norm": 1.1024017333984375, | |
| "kl": 0.625, | |
| "learning_rate": 1.9929791578083655e-07, | |
| "loss": 0.029, | |
| "reward": 0.801287055015564, | |
| "reward_std": 0.897977739572525, | |
| "rewards/cosine_scaled_reward": -0.03685649996623397, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 2505.916748046875, | |
| "epoch": 0.4617142857142857, | |
| "grad_norm": 1.409442663192749, | |
| "kl": 0.65576171875, | |
| "learning_rate": 1.9733794420337213e-07, | |
| "loss": 0.0304, | |
| "reward": 1.2360095381736755, | |
| "reward_std": 0.7143290638923645, | |
| "rewards/cosine_scaled_reward": 0.18050476163625717, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 2441.0000610351562, | |
| "epoch": 0.46285714285714286, | |
| "grad_norm": 0.8860685229301453, | |
| "kl": 0.64306640625, | |
| "learning_rate": 1.9539516087697517e-07, | |
| "loss": 0.0652, | |
| "reward": 1.0503446012735367, | |
| "reward_std": 0.8782050907611847, | |
| "rewards/cosine_scaled_reward": 0.10850561456754804, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 2316.562530517578, | |
| "epoch": 0.464, | |
| "grad_norm": 0.9385198354721069, | |
| "kl": 0.6611328125, | |
| "learning_rate": 1.934696604901642e-07, | |
| "loss": 0.039, | |
| "reward": 0.8388771619647741, | |
| "reward_std": 0.5718994289636612, | |
| "rewards/cosine_scaled_reward": -0.007644776254892349, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 2314.6459045410156, | |
| "epoch": 0.46514285714285714, | |
| "grad_norm": 1.216766357421875, | |
| "kl": 0.55029296875, | |
| "learning_rate": 1.915615368891117e-07, | |
| "loss": 0.0239, | |
| "reward": 0.8419212326407433, | |
| "reward_std": 0.65188068151474, | |
| "rewards/cosine_scaled_reward": -0.037372760474681854, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 2388.791717529297, | |
| "epoch": 0.4662857142857143, | |
| "grad_norm": 0.6723232865333557, | |
| "kl": 0.4609375, | |
| "learning_rate": 1.8967088307307e-07, | |
| "loss": 0.048, | |
| "reward": 1.100903958082199, | |
| "reward_std": 0.7514118552207947, | |
| "rewards/cosine_scaled_reward": 0.10253530507907271, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 2204.958465576172, | |
| "epoch": 0.4674285714285714, | |
| "grad_norm": 0.9829697012901306, | |
| "kl": 0.53759765625, | |
| "learning_rate": 1.8779779118983867e-07, | |
| "loss": 0.0237, | |
| "reward": 1.0897281467914581, | |
| "reward_std": 0.4026891812682152, | |
| "rewards/cosine_scaled_reward": 0.10736404359340668, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 2508.729248046875, | |
| "epoch": 0.4685714285714286, | |
| "grad_norm": 1.1136001348495483, | |
| "kl": 0.58837890625, | |
| "learning_rate": 1.8594235253127372e-07, | |
| "loss": 0.0388, | |
| "reward": 0.4441644148901105, | |
| "reward_std": 0.8706175982952118, | |
| "rewards/cosine_scaled_reward": -0.1425011307001114, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 2758.3543090820312, | |
| "epoch": 0.4697142857142857, | |
| "grad_norm": 1.1172066926956177, | |
| "kl": 0.52685546875, | |
| "learning_rate": 1.8410465752883758e-07, | |
| "loss": 0.0214, | |
| "reward": 0.38340113312005997, | |
| "reward_std": 0.6312393695116043, | |
| "rewards/cosine_scaled_reward": -0.235382791608572, | |
| "rewards/format_reward": 0.8541667014360428, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 2535.416717529297, | |
| "epoch": 0.47085714285714286, | |
| "grad_norm": 1.1818182468414307, | |
| "kl": 0.579833984375, | |
| "learning_rate": 1.822847957491922e-07, | |
| "loss": 0.028, | |
| "reward": 0.8752952516078949, | |
| "reward_std": 0.5417208820581436, | |
| "rewards/cosine_scaled_reward": 0.02098093181848526, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 2696.1250610351562, | |
| "epoch": 0.472, | |
| "grad_norm": 0.5541598796844482, | |
| "kl": 0.5654296875, | |
| "learning_rate": 1.804828558898332e-07, | |
| "loss": 0.0783, | |
| "reward": 0.497568441554904, | |
| "reward_std": 0.7255310416221619, | |
| "rewards/cosine_scaled_reward": -0.10538244433701038, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 2673.8751220703125, | |
| "epoch": 0.47314285714285714, | |
| "grad_norm": 0.9568617343902588, | |
| "kl": 0.53369140625, | |
| "learning_rate": 1.7869892577476722e-07, | |
| "loss": 0.0999, | |
| "reward": 0.9199014604091644, | |
| "reward_std": 0.8385901600122452, | |
| "rewards/cosine_scaled_reward": 0.053700722055509686, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 2954.5418090820312, | |
| "epoch": 0.4742857142857143, | |
| "grad_norm": 1.3337595462799072, | |
| "kl": 0.607421875, | |
| "learning_rate": 1.7693309235023127e-07, | |
| "loss": 0.0298, | |
| "reward": 0.843063585460186, | |
| "reward_std": 0.9124226570129395, | |
| "rewards/cosine_scaled_reward": 0.0048651136457920074, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 2910.4375610351562, | |
| "epoch": 0.4754285714285714, | |
| "grad_norm": 0.6592503786087036, | |
| "kl": 0.61865234375, | |
| "learning_rate": 1.7518544168045524e-07, | |
| "loss": 0.0733, | |
| "reward": 0.46045139618217945, | |
| "reward_std": 0.8773138746619225, | |
| "rewards/cosine_scaled_reward": -0.10310766100883484, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 2641.2918090820312, | |
| "epoch": 0.4765714285714286, | |
| "grad_norm": 0.829136073589325, | |
| "kl": 0.49462890625, | |
| "learning_rate": 1.7345605894346726e-07, | |
| "loss": 0.057, | |
| "reward": 0.9838578663766384, | |
| "reward_std": 0.7910896837711334, | |
| "rewards/cosine_scaled_reward": 0.054428933188319206, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 2964.3541870117188, | |
| "epoch": 0.4777142857142857, | |
| "grad_norm": 0.9262496829032898, | |
| "kl": 0.5478515625, | |
| "learning_rate": 1.7174502842694212e-07, | |
| "loss": 0.0357, | |
| "reward": 0.6633618324995041, | |
| "reward_std": 0.6466763466596603, | |
| "rewards/cosine_scaled_reward": -0.10581910982728004, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 2773.5626220703125, | |
| "epoch": 0.47885714285714287, | |
| "grad_norm": 0.8558900952339172, | |
| "kl": 0.49072265625, | |
| "learning_rate": 1.7005243352409333e-07, | |
| "loss": 0.0542, | |
| "reward": 0.6305762082338333, | |
| "reward_std": 0.7357209548354149, | |
| "rewards/cosine_scaled_reward": -0.080545240547508, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 2345.8959045410156, | |
| "epoch": 0.48, | |
| "grad_norm": 0.6529119610786438, | |
| "kl": 0.3431396484375, | |
| "learning_rate": 1.6837835672960831e-07, | |
| "loss": 0.0226, | |
| "reward": 1.2573866918683052, | |
| "reward_std": 0.9116456806659698, | |
| "rewards/cosine_scaled_reward": 0.1911933235824108, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 2318.1875915527344, | |
| "epoch": 0.48114285714285715, | |
| "grad_norm": 0.6412160396575928, | |
| "kl": 0.35498046875, | |
| "learning_rate": 1.6672287963562852e-07, | |
| "loss": 0.0124, | |
| "reward": 1.0443747788667679, | |
| "reward_std": 0.7097911983728409, | |
| "rewards/cosine_scaled_reward": 0.09510404244065285, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 2448.5833740234375, | |
| "epoch": 0.48228571428571426, | |
| "grad_norm": 0.6165621280670166, | |
| "kl": 0.421875, | |
| "learning_rate": 1.6508608292777203e-07, | |
| "loss": 0.0385, | |
| "reward": 0.7055833786725998, | |
| "reward_std": 0.7713779509067535, | |
| "rewards/cosine_scaled_reward": -0.053458321839571, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 2370.479278564453, | |
| "epoch": 0.48342857142857143, | |
| "grad_norm": 1.0260326862335205, | |
| "kl": 0.325927734375, | |
| "learning_rate": 1.6346804638120098e-07, | |
| "loss": 0.0657, | |
| "reward": 0.8030254691839218, | |
| "reward_std": 0.8349241316318512, | |
| "rewards/cosine_scaled_reward": -0.015153962187469006, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 2863.0833740234375, | |
| "epoch": 0.4845714285714286, | |
| "grad_norm": 0.8439249396324158, | |
| "kl": 0.43115234375, | |
| "learning_rate": 1.6186884885673413e-07, | |
| "loss": 0.0713, | |
| "reward": 0.4908841624855995, | |
| "reward_std": 0.8119627386331558, | |
| "rewards/cosine_scaled_reward": -0.15039126574993134, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 2920.604248046875, | |
| "epoch": 0.4857142857142857, | |
| "grad_norm": 0.7168906927108765, | |
| "kl": 0.455078125, | |
| "learning_rate": 1.6028856829700258e-07, | |
| "loss": 0.0577, | |
| "reward": 0.8773088157176971, | |
| "reward_std": 0.8730379045009613, | |
| "rewards/cosine_scaled_reward": 0.032404396682977676, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 2877.354248046875, | |
| "epoch": 0.4868571428571429, | |
| "grad_norm": 0.7351894974708557, | |
| "kl": 0.3916015625, | |
| "learning_rate": 1.5872728172265146e-07, | |
| "loss": 0.0252, | |
| "reward": 1.0884526520967484, | |
| "reward_std": 0.8330738395452499, | |
| "rewards/cosine_scaled_reward": 0.10672629997134209, | |
| "rewards/format_reward": 0.8750000298023224, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 2745.041748046875, | |
| "epoch": 0.488, | |
| "grad_norm": 0.4892515242099762, | |
| "kl": 0.33447265625, | |
| "learning_rate": 1.5718506522858572e-07, | |
| "loss": 0.0358, | |
| "reward": 1.0718627832829952, | |
| "reward_std": 0.7832525819540024, | |
| "rewards/cosine_scaled_reward": 0.11926471255719662, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 2932.1458740234375, | |
| "epoch": 0.48914285714285716, | |
| "grad_norm": 1.292845606803894, | |
| "kl": 0.52685546875, | |
| "learning_rate": 1.5566199398026147e-07, | |
| "loss": 0.0264, | |
| "reward": 0.31675857678055763, | |
| "reward_std": 0.5401652418076992, | |
| "rewards/cosine_scaled_reward": -0.21662072464823723, | |
| "rewards/format_reward": 0.7500000074505806, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 2592.3334045410156, | |
| "epoch": 0.49028571428571427, | |
| "grad_norm": 0.6887741088867188, | |
| "kl": 0.39111328125, | |
| "learning_rate": 1.5415814221002265e-07, | |
| "loss": 0.0134, | |
| "reward": 0.8551270663738251, | |
| "reward_std": 0.883497804403305, | |
| "rewards/cosine_scaled_reward": -0.02035313844680786, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 2997.7709350585938, | |
| "epoch": 0.49142857142857144, | |
| "grad_norm": 0.9550595283508301, | |
| "kl": 0.4248046875, | |
| "learning_rate": 1.5267358321348285e-07, | |
| "loss": 0.0977, | |
| "reward": 0.7853763314778917, | |
| "reward_std": 0.862298920750618, | |
| "rewards/cosine_scaled_reward": -0.013561863452196121, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 2814.6459350585938, | |
| "epoch": 0.49257142857142855, | |
| "grad_norm": 0.35693833231925964, | |
| "kl": 0.42578125, | |
| "learning_rate": 1.5120838934595337e-07, | |
| "loss": 0.0465, | |
| "reward": 0.7550955265760422, | |
| "reward_std": 0.797643780708313, | |
| "rewards/cosine_scaled_reward": -0.01828559674322605, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 2680.0833740234375, | |
| "epoch": 0.4937142857142857, | |
| "grad_norm": 0.3660014867782593, | |
| "kl": 0.42138671875, | |
| "learning_rate": 1.4976263201891613e-07, | |
| "loss": 0.0453, | |
| "reward": 0.5384078100323677, | |
| "reward_std": 0.6302113831043243, | |
| "rewards/cosine_scaled_reward": -0.11621277220547199, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 3197.0625610351562, | |
| "epoch": 0.4948571428571429, | |
| "grad_norm": 0.834852397441864, | |
| "kl": 0.45703125, | |
| "learning_rate": 1.483363816965435e-07, | |
| "loss": 0.0415, | |
| "reward": 0.677655503153801, | |
| "reward_std": 0.997919499874115, | |
| "rewards/cosine_scaled_reward": -0.015338926576077938, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 2018.541748046875, | |
| "epoch": 0.496, | |
| "grad_norm": 0.3951985836029053, | |
| "kl": 0.17779541015625, | |
| "learning_rate": 1.469297078922642e-07, | |
| "loss": -0.0128, | |
| "reward": 1.5104268491268158, | |
| "reward_std": 0.6382196992635727, | |
| "rewards/cosine_scaled_reward": 0.2760467454791069, | |
| "rewards/format_reward": 0.9583333432674408, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 2781.45849609375, | |
| "epoch": 0.49714285714285716, | |
| "grad_norm": 0.8080605268478394, | |
| "kl": 0.41552734375, | |
| "learning_rate": 1.4554267916537495e-07, | |
| "loss": 0.072, | |
| "reward": 0.5199687406420708, | |
| "reward_std": 0.697292298078537, | |
| "rewards/cosine_scaled_reward": -0.11501563712954521, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 2910.9168090820312, | |
| "epoch": 0.4982857142857143, | |
| "grad_norm": 1.0082898139953613, | |
| "kl": 0.31591796875, | |
| "learning_rate": 1.4417536311769885e-07, | |
| "loss": 0.0441, | |
| "reward": 1.0633302181959152, | |
| "reward_std": 0.8466629385948181, | |
| "rewards/cosine_scaled_reward": 0.06291508674621582, | |
| "rewards/format_reward": 0.9375000149011612, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 2581.2500610351562, | |
| "epoch": 0.49942857142857144, | |
| "grad_norm": 0.5378354787826538, | |
| "kl": 0.2705078125, | |
| "learning_rate": 1.4282782639029128e-07, | |
| "loss": 0.003, | |
| "reward": 1.0827649384737015, | |
| "reward_std": 0.822308674454689, | |
| "rewards/cosine_scaled_reward": 0.10388245154172182, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 2723.1251220703125, | |
| "epoch": 0.5005714285714286, | |
| "grad_norm": 0.6586508750915527, | |
| "kl": 0.340087890625, | |
| "learning_rate": 1.4150013466019114e-07, | |
| "loss": 0.046, | |
| "reward": 1.0363626778125763, | |
| "reward_std": 0.9988095015287399, | |
| "rewards/cosine_scaled_reward": 0.10151464305818081, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 2458.041778564453, | |
| "epoch": 0.5017142857142857, | |
| "grad_norm": 0.6118423342704773, | |
| "kl": 0.3319091796875, | |
| "learning_rate": 1.4019235263722034e-07, | |
| "loss": 0.059, | |
| "reward": 0.7599635235965252, | |
| "reward_std": 0.6979039385914803, | |
| "rewards/cosine_scaled_reward": -0.05751825252082199, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 2812.7916870117188, | |
| "epoch": 0.5028571428571429, | |
| "grad_norm": 0.6263717412948608, | |
| "kl": 0.346435546875, | |
| "learning_rate": 1.3890454406082956e-07, | |
| "loss": 0.0402, | |
| "reward": 0.7473399192094803, | |
| "reward_std": 0.7950000017881393, | |
| "rewards/cosine_scaled_reward": -0.04299671063199639, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 2658.854217529297, | |
| "epoch": 0.504, | |
| "grad_norm": 0.48751676082611084, | |
| "kl": 0.3270263671875, | |
| "learning_rate": 1.3763677169699217e-07, | |
| "loss": 0.0419, | |
| "reward": 0.7070795819163322, | |
| "reward_std": 0.773023784160614, | |
| "rewards/cosine_scaled_reward": -0.04229356348514557, | |
| "rewards/format_reward": 0.7916666716337204, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 2351.6250610351562, | |
| "epoch": 0.5051428571428571, | |
| "grad_norm": 0.5668932199478149, | |
| "kl": 0.252685546875, | |
| "learning_rate": 1.3638909733514452e-07, | |
| "loss": 0.0475, | |
| "reward": 0.8659966886043549, | |
| "reward_std": 0.5813730582594872, | |
| "rewards/cosine_scaled_reward": 0.016331655904650688, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 2956.729248046875, | |
| "epoch": 0.5062857142857143, | |
| "grad_norm": 0.3870391249656677, | |
| "kl": 0.30859375, | |
| "learning_rate": 1.351615817851748e-07, | |
| "loss": 0.0416, | |
| "reward": 1.151278093457222, | |
| "reward_std": 0.8103004992008209, | |
| "rewards/cosine_scaled_reward": 0.11730570159852505, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 2814.6876220703125, | |
| "epoch": 0.5074285714285715, | |
| "grad_norm": 0.5548789501190186, | |
| "kl": 0.369140625, | |
| "learning_rate": 1.3395428487445914e-07, | |
| "loss": 0.0343, | |
| "reward": 0.9690770208835602, | |
| "reward_std": 0.9044716209173203, | |
| "rewards/cosine_scaled_reward": 0.09912180341780186, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 2858.6875610351562, | |
| "epoch": 0.5085714285714286, | |
| "grad_norm": 0.7488447427749634, | |
| "kl": 0.3701171875, | |
| "learning_rate": 1.3276726544494571e-07, | |
| "loss": 0.0152, | |
| "reward": 0.6591560812667012, | |
| "reward_std": 0.6855928599834442, | |
| "rewards/cosine_scaled_reward": -0.035005307756364346, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 2472.7500610351562, | |
| "epoch": 0.5097142857142857, | |
| "grad_norm": 0.5907102227210999, | |
| "kl": 0.208251953125, | |
| "learning_rate": 1.316005813502869e-07, | |
| "loss": 0.0325, | |
| "reward": 1.3291829228401184, | |
| "reward_std": 0.7747218981385231, | |
| "rewards/cosine_scaled_reward": 0.206258125603199, | |
| "rewards/format_reward": 0.9166666716337204, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 2428.1459350585938, | |
| "epoch": 0.5108571428571429, | |
| "grad_norm": 0.5603023171424866, | |
| "kl": 0.2802734375, | |
| "learning_rate": 1.3045428945301953e-07, | |
| "loss": 0.0368, | |
| "reward": 0.9525867849588394, | |
| "reward_std": 0.712784081697464, | |
| "rewards/cosine_scaled_reward": 0.038793399930000305, | |
| "rewards/format_reward": 0.875, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 2589.3958740234375, | |
| "epoch": 0.512, | |
| "grad_norm": 0.9914929866790771, | |
| "kl": 0.297607421875, | |
| "learning_rate": 1.2932844562179352e-07, | |
| "loss": 0.0567, | |
| "reward": 1.3133542239665985, | |
| "reward_std": 1.0432665199041367, | |
| "rewards/cosine_scaled_reward": 0.27126041799783707, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 2799.166748046875, | |
| "epoch": 0.5131428571428571, | |
| "grad_norm": 1.0846092700958252, | |
| "kl": 0.4091796875, | |
| "learning_rate": 1.2822310472864885e-07, | |
| "loss": 0.0606, | |
| "reward": 0.9647302851080894, | |
| "reward_std": 0.7462186589837074, | |
| "rewards/cosine_scaled_reward": 0.10736512392759323, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 2719.916748046875, | |
| "epoch": 0.5142857142857142, | |
| "grad_norm": 0.5918545126914978, | |
| "kl": 0.3916015625, | |
| "learning_rate": 1.2713832064634125e-07, | |
| "loss": 0.014, | |
| "reward": 1.0999898612499237, | |
| "reward_std": 0.8317281156778336, | |
| "rewards/cosine_scaled_reward": 0.15416158083826303, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 2748.4584350585938, | |
| "epoch": 0.5154285714285715, | |
| "grad_norm": 1.2674349546432495, | |
| "kl": 0.348876953125, | |
| "learning_rate": 1.260741462457165e-07, | |
| "loss": 0.0753, | |
| "reward": 0.851899653673172, | |
| "reward_std": 0.9279103875160217, | |
| "rewards/cosine_scaled_reward": 0.019699793308973312, | |
| "rewards/format_reward": 0.8125000298023224, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 2946.291748046875, | |
| "epoch": 0.5165714285714286, | |
| "grad_norm": 0.9848341941833496, | |
| "kl": 0.4384765625, | |
| "learning_rate": 1.2503063339313356e-07, | |
| "loss": 0.0244, | |
| "reward": 0.7191433683037758, | |
| "reward_std": 0.8444506227970123, | |
| "rewards/cosine_scaled_reward": -0.0154283307492733, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 2824.5000610351562, | |
| "epoch": 0.5177142857142857, | |
| "grad_norm": 1.562027931213379, | |
| "kl": 0.450439453125, | |
| "learning_rate": 1.2400783294793668e-07, | |
| "loss": 0.0678, | |
| "reward": 0.9857252687215805, | |
| "reward_std": 0.8770118951797485, | |
| "rewards/cosine_scaled_reward": 0.1074459683150053, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 2845.291748046875, | |
| "epoch": 0.5188571428571429, | |
| "grad_norm": 1.0593106746673584, | |
| "kl": 0.399658203125, | |
| "learning_rate": 1.2300579475997657e-07, | |
| "loss": 0.0191, | |
| "reward": 0.5798447616398335, | |
| "reward_std": 0.7729413360357285, | |
| "rewards/cosine_scaled_reward": -0.11632763035595417, | |
| "rewards/format_reward": 0.8125, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 2406.979248046875, | |
| "epoch": 0.52, | |
| "grad_norm": 0.4025033712387085, | |
| "kl": 0.32861328125, | |
| "learning_rate": 1.220245676671809e-07, | |
| "loss": 0.0397, | |
| "reward": 1.0016262233257294, | |
| "reward_std": 0.6507641598582268, | |
| "rewards/cosine_scaled_reward": 0.104979757219553, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 2493.3125915527344, | |
| "epoch": 0.5211428571428571, | |
| "grad_norm": 0.6641373038291931, | |
| "kl": 0.3935546875, | |
| "learning_rate": 1.2106419949317388e-07, | |
| "loss": 0.0496, | |
| "reward": 0.8123725727200508, | |
| "reward_std": 0.6888710185885429, | |
| "rewards/cosine_scaled_reward": 0.04160293936729431, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 2579.354248046875, | |
| "epoch": 0.5222857142857142, | |
| "grad_norm": 0.3551529347896576, | |
| "kl": 0.35302734375, | |
| "learning_rate": 1.2012473704494537e-07, | |
| "loss": 0.0275, | |
| "reward": 0.670621931552887, | |
| "reward_std": 0.6615720614790916, | |
| "rewards/cosine_scaled_reward": -0.06052236817777157, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 2468.854217529297, | |
| "epoch": 0.5234285714285715, | |
| "grad_norm": 0.5066484212875366, | |
| "kl": 0.423828125, | |
| "learning_rate": 1.1920622611056974e-07, | |
| "loss": 0.0466, | |
| "reward": 1.0128154456615448, | |
| "reward_std": 0.9961100518703461, | |
| "rewards/cosine_scaled_reward": 0.08974102255888283, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 2763.354248046875, | |
| "epoch": 0.5245714285714286, | |
| "grad_norm": 0.7024835348129272, | |
| "kl": 0.363037109375, | |
| "learning_rate": 1.1830871145697412e-07, | |
| "loss": 0.0672, | |
| "reward": 0.604728564620018, | |
| "reward_std": 0.7839554250240326, | |
| "rewards/cosine_scaled_reward": -0.11430239118635654, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 2871.4584350585938, | |
| "epoch": 0.5257142857142857, | |
| "grad_norm": 0.6273028254508972, | |
| "kl": 0.372314453125, | |
| "learning_rate": 1.1743223682775649e-07, | |
| "loss": 0.0527, | |
| "reward": 1.0004199892282486, | |
| "reward_std": 0.8981437683105469, | |
| "rewards/cosine_scaled_reward": 0.1147933267056942, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 2718.2709350585938, | |
| "epoch": 0.5268571428571428, | |
| "grad_norm": 0.46946173906326294, | |
| "kl": 0.447021484375, | |
| "learning_rate": 1.1657684494105386e-07, | |
| "loss": 0.0404, | |
| "reward": 1.022796869277954, | |
| "reward_std": 0.7989484220743179, | |
| "rewards/cosine_scaled_reward": 0.12598175182938576, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 2926.0416870117188, | |
| "epoch": 0.528, | |
| "grad_norm": 1.261118769645691, | |
| "kl": 0.525390625, | |
| "learning_rate": 1.1574257748745986e-07, | |
| "loss": 0.0831, | |
| "reward": 0.7424125671386719, | |
| "reward_std": 0.9555595070123672, | |
| "rewards/cosine_scaled_reward": -0.0037937182933092117, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 2262.4376220703125, | |
| "epoch": 0.5291428571428571, | |
| "grad_norm": 0.5456348657608032, | |
| "kl": 0.3070068359375, | |
| "learning_rate": 1.1492947512799328e-07, | |
| "loss": 0.0543, | |
| "reward": 1.0686239376664162, | |
| "reward_std": 0.6754159927368164, | |
| "rewards/cosine_scaled_reward": 0.1488952711224556, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 3016.8958740234375, | |
| "epoch": 0.5302857142857142, | |
| "grad_norm": 1.5390175580978394, | |
| "kl": 0.45947265625, | |
| "learning_rate": 1.1413757749211602e-07, | |
| "loss": 0.1119, | |
| "reward": 0.8216940313577652, | |
| "reward_std": 1.1384240239858627, | |
| "rewards/cosine_scaled_reward": 0.03584700915962458, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 2775.0208740234375, | |
| "epoch": 0.5314285714285715, | |
| "grad_norm": 1.5516222715377808, | |
| "kl": 0.47607421875, | |
| "learning_rate": 1.1336692317580158e-07, | |
| "loss": 0.0147, | |
| "reward": 0.7128820940852165, | |
| "reward_std": 0.8897013664245605, | |
| "rewards/cosine_scaled_reward": -0.018558980314992368, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 2875.3333740234375, | |
| "epoch": 0.5325714285714286, | |
| "grad_norm": 0.6315276622772217, | |
| "kl": 0.55029296875, | |
| "learning_rate": 1.1261754973965422e-07, | |
| "loss": 0.0399, | |
| "reward": 0.6401756927371025, | |
| "reward_std": 0.7611015811562538, | |
| "rewards/cosine_scaled_reward": -0.054912167601287365, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 2514.8750610351562, | |
| "epoch": 0.5337142857142857, | |
| "grad_norm": 0.43570035696029663, | |
| "kl": 0.39990234375, | |
| "learning_rate": 1.1188949370707787e-07, | |
| "loss": 0.0301, | |
| "reward": 0.6949951946735382, | |
| "reward_std": 0.7680038511753082, | |
| "rewards/cosine_scaled_reward": -0.06916908174753189, | |
| "rewards/format_reward": 0.8333333730697632, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 2586.1458740234375, | |
| "epoch": 0.5348571428571428, | |
| "grad_norm": 0.6298258304595947, | |
| "kl": 0.396484375, | |
| "learning_rate": 1.1118279056249653e-07, | |
| "loss": 0.0409, | |
| "reward": 1.2849786281585693, | |
| "reward_std": 0.9066727161407471, | |
| "rewards/cosine_scaled_reward": 0.2570726328995079, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 2279.604248046875, | |
| "epoch": 0.536, | |
| "grad_norm": 0.42815151810646057, | |
| "kl": 0.2633056640625, | |
| "learning_rate": 1.1049747474962444e-07, | |
| "loss": 0.015, | |
| "reward": 0.784978911280632, | |
| "reward_std": 0.6496678665280342, | |
| "rewards/cosine_scaled_reward": -0.04501055763103068, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 2119.416748046875, | |
| "epoch": 0.5371428571428571, | |
| "grad_norm": 1.2341870069503784, | |
| "kl": 0.427001953125, | |
| "learning_rate": 1.0983357966978745e-07, | |
| "loss": 0.054, | |
| "reward": 0.6538757495582104, | |
| "reward_std": 0.8121753484010696, | |
| "rewards/cosine_scaled_reward": -0.037645455449819565, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 2927.5834350585938, | |
| "epoch": 0.5382857142857143, | |
| "grad_norm": 0.632990300655365, | |
| "kl": 0.61474609375, | |
| "learning_rate": 1.0919113768029517e-07, | |
| "loss": 0.0463, | |
| "reward": 0.5753965899348259, | |
| "reward_std": 0.8329771310091019, | |
| "rewards/cosine_scaled_reward": -0.1081350538879633, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 2382.7500610351562, | |
| "epoch": 0.5394285714285715, | |
| "grad_norm": 0.4871074855327606, | |
| "kl": 0.42333984375, | |
| "learning_rate": 1.0857018009286381e-07, | |
| "loss": 0.0518, | |
| "reward": 1.0515232384204865, | |
| "reward_std": 0.8982365727424622, | |
| "rewards/cosine_scaled_reward": 0.16117826476693153, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 2928.9584350585938, | |
| "epoch": 0.5405714285714286, | |
| "grad_norm": 1.3636996746063232, | |
| "kl": 0.498779296875, | |
| "learning_rate": 1.0797073717209013e-07, | |
| "loss": 0.03, | |
| "reward": 0.32807744294404984, | |
| "reward_std": 0.5049104988574982, | |
| "rewards/cosine_scaled_reward": -0.21096128597855568, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 2577.0625610351562, | |
| "epoch": 0.5417142857142857, | |
| "grad_norm": 1.3398447036743164, | |
| "kl": 0.351806640625, | |
| "learning_rate": 1.0739283813397639e-07, | |
| "loss": 0.0931, | |
| "reward": 0.9431183338165283, | |
| "reward_std": 0.893795982003212, | |
| "rewards/cosine_scaled_reward": 0.08614248159574345, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 2737.3751220703125, | |
| "epoch": 0.5428571428571428, | |
| "grad_norm": 1.3732081651687622, | |
| "kl": 0.3955078125, | |
| "learning_rate": 1.068365111445064e-07, | |
| "loss": 0.0893, | |
| "reward": 0.8586708009243011, | |
| "reward_std": 0.8809327185153961, | |
| "rewards/cosine_scaled_reward": 0.06475206837058067, | |
| "rewards/format_reward": 0.7291666865348816, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 2780.3125610351562, | |
| "epoch": 0.544, | |
| "grad_norm": 1.55986750125885, | |
| "kl": 0.4127197265625, | |
| "learning_rate": 1.063017833182728e-07, | |
| "loss": 0.0047, | |
| "reward": 0.8244488090276718, | |
| "reward_std": 0.7860056459903717, | |
| "rewards/cosine_scaled_reward": 0.05805772356688976, | |
| "rewards/format_reward": 0.7083333432674408, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 2252.229248046875, | |
| "epoch": 0.5451428571428572, | |
| "grad_norm": 0.784569263458252, | |
| "kl": 0.378082275390625, | |
| "learning_rate": 1.0578868071715544e-07, | |
| "loss": 0.0354, | |
| "reward": 1.200981080532074, | |
| "reward_std": 0.7509779334068298, | |
| "rewards/cosine_scaled_reward": 0.1734071932733059, | |
| "rewards/format_reward": 0.8541666865348816, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 2425.9583740234375, | |
| "epoch": 0.5462857142857143, | |
| "grad_norm": 0.4835829436779022, | |
| "kl": 0.4466552734375, | |
| "learning_rate": 1.0529722834905125e-07, | |
| "loss": 0.0496, | |
| "reward": 0.7616169229149818, | |
| "reward_std": 0.6851886659860611, | |
| "rewards/cosine_scaled_reward": -0.035858187824487686, | |
| "rewards/format_reward": 0.8333333432674408, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 2255.3750610351562, | |
| "epoch": 0.5474285714285714, | |
| "grad_norm": 0.9519103765487671, | |
| "kl": 0.386962890625, | |
| "learning_rate": 1.0482745016665526e-07, | |
| "loss": 0.0216, | |
| "reward": 0.9349322374910116, | |
| "reward_std": 0.613688588142395, | |
| "rewards/cosine_scaled_reward": 0.08204942103475332, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 2591.8959350585938, | |
| "epoch": 0.5485714285714286, | |
| "grad_norm": 0.619563102722168, | |
| "kl": 0.52685546875, | |
| "learning_rate": 1.0437936906629334e-07, | |
| "loss": 0.042, | |
| "reward": 0.7943236902356148, | |
| "reward_std": 1.037893146276474, | |
| "rewards/cosine_scaled_reward": 0.06382851302623749, | |
| "rewards/format_reward": 0.6666666865348816, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 2677.4791870117188, | |
| "epoch": 0.5497142857142857, | |
| "grad_norm": 0.45002222061157227, | |
| "kl": 0.56689453125, | |
| "learning_rate": 1.0395300688680625e-07, | |
| "loss": 0.0405, | |
| "reward": 0.5671083256602287, | |
| "reward_std": 0.708008423447609, | |
| "rewards/cosine_scaled_reward": -0.10186250880360603, | |
| "rewards/format_reward": 0.7708333432674408, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 2174.2291870117188, | |
| "epoch": 0.5508571428571428, | |
| "grad_norm": 0.3016662299633026, | |
| "kl": 0.340087890625, | |
| "learning_rate": 1.0354838440848501e-07, | |
| "loss": 0.0176, | |
| "reward": 0.9943665787577629, | |
| "reward_std": 0.5935569703578949, | |
| "rewards/cosine_scaled_reward": 0.04926658235490322, | |
| "rewards/format_reward": 0.8958333432674408, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 2582.6459350585938, | |
| "epoch": 0.552, | |
| "grad_norm": 0.7917870879173279, | |
| "kl": 0.466064453125, | |
| "learning_rate": 1.0316552135205837e-07, | |
| "loss": 0.0279, | |
| "reward": 0.6264216639101505, | |
| "reward_std": 0.9700927287340164, | |
| "rewards/cosine_scaled_reward": -0.04095582733862102, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 2702.2708740234375, | |
| "epoch": 0.5531428571428572, | |
| "grad_norm": 0.5935311317443848, | |
| "kl": 0.388427734375, | |
| "learning_rate": 1.0280443637773163e-07, | |
| "loss": 0.0319, | |
| "reward": 0.9062394499778748, | |
| "reward_std": 0.7218269556760788, | |
| "rewards/cosine_scaled_reward": 0.06770304590463638, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 2429.1458435058594, | |
| "epoch": 0.5542857142857143, | |
| "grad_norm": 0.7909466028213501, | |
| "kl": 0.4248046875, | |
| "learning_rate": 1.0246514708427701e-07, | |
| "loss": 0.0119, | |
| "reward": 0.7019704282283783, | |
| "reward_std": 0.6897935420274734, | |
| "rewards/cosine_scaled_reward": -0.0448481235653162, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 2677.291748046875, | |
| "epoch": 0.5554285714285714, | |
| "grad_norm": 1.1475855112075806, | |
| "kl": 0.327880859375, | |
| "learning_rate": 1.0214767000817596e-07, | |
| "loss": 0.0051, | |
| "reward": 1.1072902642190456, | |
| "reward_std": 0.7692115753889084, | |
| "rewards/cosine_scaled_reward": 0.14739511162042618, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 2795.8750610351562, | |
| "epoch": 0.5565714285714286, | |
| "grad_norm": 0.5653597116470337, | |
| "kl": 0.3798828125, | |
| "learning_rate": 1.0185202062281336e-07, | |
| "loss": 0.0451, | |
| "reward": 0.7866236716508865, | |
| "reward_std": 0.6821945160627365, | |
| "rewards/cosine_scaled_reward": -0.02335483953356743, | |
| "rewards/format_reward": 0.833333358168602, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 2895.3751220703125, | |
| "epoch": 0.5577142857142857, | |
| "grad_norm": 0.4974069893360138, | |
| "kl": 0.4326171875, | |
| "learning_rate": 1.0157821333772304e-07, | |
| "loss": 0.062, | |
| "reward": 0.5221007950603962, | |
| "reward_std": 0.8605436235666275, | |
| "rewards/cosine_scaled_reward": -0.1035329382866621, | |
| "rewards/format_reward": 0.7291667014360428, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 2012.541748046875, | |
| "epoch": 0.5588571428571428, | |
| "grad_norm": 0.5164794921875, | |
| "kl": 0.254058837890625, | |
| "learning_rate": 1.013262614978859e-07, | |
| "loss": 0.0022, | |
| "reward": 1.416559837758541, | |
| "reward_std": 0.6288183927536011, | |
| "rewards/cosine_scaled_reward": 0.2707799021154642, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 2575.6668090820312, | |
| "epoch": 0.56, | |
| "grad_norm": 0.8971602916717529, | |
| "kl": 0.3701171875, | |
| "learning_rate": 1.0109617738307911e-07, | |
| "loss": 0.0111, | |
| "reward": 0.5933700278401375, | |
| "reward_std": 0.6079118028283119, | |
| "rewards/cosine_scaled_reward": -0.1408149916678667, | |
| "rewards/format_reward": 0.8750000149011612, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 2626.854217529297, | |
| "epoch": 0.5611428571428572, | |
| "grad_norm": 0.7071827054023743, | |
| "kl": 0.3095703125, | |
| "learning_rate": 1.0088797220727779e-07, | |
| "loss": 0.043, | |
| "reward": 0.9613501131534576, | |
| "reward_std": 0.8130423650145531, | |
| "rewards/cosine_scaled_reward": 0.07442504540085793, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 2436.7291870117188, | |
| "epoch": 0.5622857142857143, | |
| "grad_norm": 0.44464409351348877, | |
| "kl": 0.28173828125, | |
| "learning_rate": 1.0070165611810855e-07, | |
| "loss": 0.0244, | |
| "reward": 0.7668804228305817, | |
| "reward_std": 0.6314697042107582, | |
| "rewards/cosine_scaled_reward": -0.07489313930273056, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 2737.0209350585938, | |
| "epoch": 0.5634285714285714, | |
| "grad_norm": 0.5461977124214172, | |
| "kl": 0.404296875, | |
| "learning_rate": 1.005372381963547e-07, | |
| "loss": 0.038, | |
| "reward": 0.5373080670833588, | |
| "reward_std": 0.7348825931549072, | |
| "rewards/cosine_scaled_reward": -0.11676262941909954, | |
| "rewards/format_reward": 0.770833358168602, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 2169.5416870117188, | |
| "epoch": 0.5645714285714286, | |
| "grad_norm": 0.2975417971611023, | |
| "kl": 0.2210693359375, | |
| "learning_rate": 1.0039472645551372e-07, | |
| "loss": 0.0104, | |
| "reward": 0.6467055715620518, | |
| "reward_std": 0.6691789701581001, | |
| "rewards/cosine_scaled_reward": -0.10373054444789886, | |
| "rewards/format_reward": 0.8541666716337204, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 2759.9793090820312, | |
| "epoch": 0.5657142857142857, | |
| "grad_norm": 0.7536102533340454, | |
| "kl": 0.2822265625, | |
| "learning_rate": 1.002741278414069e-07, | |
| "loss": 0.0527, | |
| "reward": 1.0850744023919106, | |
| "reward_std": 0.9734541922807693, | |
| "rewards/cosine_scaled_reward": 0.13628720492124557, | |
| "rewards/format_reward": 0.8125000149011612, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 2828.6459350585938, | |
| "epoch": 0.5668571428571428, | |
| "grad_norm": 0.7388039231300354, | |
| "kl": 0.3896484375, | |
| "learning_rate": 1.0017544823184055e-07, | |
| "loss": 0.0107, | |
| "reward": 0.9930586367845535, | |
| "reward_std": 0.9435475766658783, | |
| "rewards/cosine_scaled_reward": 0.1215293172863312, | |
| "rewards/format_reward": 0.7500000298023224, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 2202.437530517578, | |
| "epoch": 0.568, | |
| "grad_norm": 0.4381030201911926, | |
| "kl": 0.25762939453125, | |
| "learning_rate": 1.0009869243631952e-07, | |
| "loss": 0.0376, | |
| "reward": 1.1173406671732664, | |
| "reward_std": 0.5638850405812263, | |
| "rewards/cosine_scaled_reward": 0.1836703196167946, | |
| "rewards/format_reward": 0.7500000149011612, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 2922.4376220703125, | |
| "epoch": 0.5691428571428572, | |
| "grad_norm": 0.3199293315410614, | |
| "kl": 0.4296875, | |
| "learning_rate": 1.000438641958131e-07, | |
| "loss": 0.0655, | |
| "reward": 0.23180836997926235, | |
| "reward_std": 0.6018998995423317, | |
| "rewards/cosine_scaled_reward": -0.23826248571276665, | |
| "rewards/format_reward": 0.708333358168602, | |
| "step": 498 | |
| }, | |
| { | |
| "completion_length": 2946.0626220703125, | |
| "epoch": 0.5702857142857143, | |
| "grad_norm": 0.9604411125183105, | |
| "kl": 0.41943359375, | |
| "learning_rate": 1.0001096618257236e-07, | |
| "loss": 0.068, | |
| "reward": 0.4334963224828243, | |
| "reward_std": 0.9516143649816513, | |
| "rewards/cosine_scaled_reward": -0.1270018396899104, | |
| "rewards/format_reward": 0.6875000149011612, | |
| "step": 499 | |
| }, | |
| { | |
| "completion_length": 2515.8958740234375, | |
| "epoch": 0.5714285714285714, | |
| "grad_norm": 1.0595104694366455, | |
| "kl": 0.287109375, | |
| "learning_rate": 1e-07, | |
| "loss": 0.0511, | |
| "reward": 0.935544490814209, | |
| "reward_std": 1.0099718570709229, | |
| "rewards/cosine_scaled_reward": 0.07193891797214746, | |
| "rewards/format_reward": 0.7916666865348816, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5714285714285714, | |
| "step": 500, | |
| "total_flos": 0.0, | |
| "train_loss": 0.008432806108146906, | |
| "train_runtime": 8817.9865, | |
| "train_samples_per_second": 2.722, | |
| "train_steps_per_second": 0.057 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |