{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.007706178814173204, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 3.082471525669282e-05, "grad_norm": 0.13662848638776823, "kl": 0.0, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.2896246537566185, "reward_std": 0.043548169545829296, "rewards/clip_reward": 0.2896246537566185, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 6.164943051338563e-05, "grad_norm": 0.1178878537123743, "kl": 0.0007257461547851562, "learning_rate": 1e-05, "loss": 0.0, "reward": 0.28567346930503845, "reward_std": 0.04437257535755634, "rewards/clip_reward": 0.28567346930503845, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 9.247414577007844e-05, "grad_norm": 0.3404139762424654, "kl": 0.0029239654541015625, "learning_rate": 1e-05, "loss": 0.0001, "reward": 0.26198844239115715, "reward_std": 0.03637277893722057, "rewards/clip_reward": 0.26198844239115715, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00012329886102677127, "grad_norm": 0.21757091715643603, "kl": 0.001544952392578125, "learning_rate": 1e-05, "loss": 0.0001, "reward": 0.2846095412969589, "reward_std": 0.03777279099449515, "rewards/clip_reward": 0.2846095412969589, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00015412357628346408, "grad_norm": 0.1348527934598492, "kl": 0.002452850341796875, "learning_rate": 1e-05, "loss": 0.0001, "reward": 0.23306814581155777, "reward_std": 0.033804881386458874, "rewards/clip_reward": 0.23306814581155777, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0001849482915401569, "grad_norm": 0.14393009622892527, "kl": 0.00441741943359375, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.2847321555018425, "reward_std": 0.040111628361046314, "rewards/clip_reward": 0.2847321555018425, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00021577300679684973, "grad_norm": 0.14351852552309294, "kl": 0.00428009033203125, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.2802872806787491, "reward_std": 0.0383415911346674, "rewards/clip_reward": 0.2802872806787491, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00024659772205354254, "grad_norm": 0.16776667321961602, "kl": 0.00494384765625, "learning_rate": 1e-05, "loss": 0.0002, "reward": 0.2881240174174309, "reward_std": 0.04321274207904935, "rewards/clip_reward": 0.2881240174174309, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0002774224373102353, "grad_norm": 0.13329464822711182, "kl": 0.009368896484375, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.2757921889424324, "reward_std": 0.042105874978005886, "rewards/clip_reward": 0.2757921889424324, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00030824715256692816, "grad_norm": 0.12887604952918402, "kl": 0.00988006591796875, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.287712462246418, "reward_std": 0.03645364008843899, "rewards/clip_reward": 0.287712462246418, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.000339071867823621, "grad_norm": 0.1231047237735355, "kl": 0.0100250244140625, "learning_rate": 1e-05, "loss": 0.0004, "reward": 0.24786356836557388, "reward_std": 0.036720491014420986, "rewards/clip_reward": 0.24786356836557388, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0003698965830803138, "grad_norm": 0.3414993544210762, "kl": 0.01570892333984375, "learning_rate": 1e-05, "loss": 0.0006, "reward": 0.31398245692253113, "reward_std": 0.05272817797958851, "rewards/clip_reward": 0.31398245692253113, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0004007212983370066, "grad_norm": 0.13599349338370845, "kl": 0.014190673828125, "learning_rate": 1e-05, "loss": 0.0006, "reward": 0.28073475882411003, "reward_std": 0.04901007656008005, "rewards/clip_reward": 0.28073475882411003, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00043154601359369945, "grad_norm": 0.127187147587617, "kl": 0.0131378173828125, "learning_rate": 1e-05, "loss": 0.0005, "reward": 0.280636228621006, "reward_std": 0.039431299082934856, "rewards/clip_reward": 0.280636228621006, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00046237072885039224, "grad_norm": 0.13215878836878708, "kl": 0.0188140869140625, "learning_rate": 1e-05, "loss": 0.0008, "reward": 0.29201044142246246, "reward_std": 0.035117349587380886, "rewards/clip_reward": 0.29201044142246246, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0004931954441070851, "grad_norm": 0.12256785878118313, "kl": 0.022613525390625, "learning_rate": 1e-05, "loss": 0.0009, "reward": 0.2798103988170624, "reward_std": 0.03762377658858895, "rewards/clip_reward": 0.2798103988170624, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0005240201593637779, "grad_norm": 0.19802700549242463, "kl": 0.02978515625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.27924566715955734, "reward_std": 0.04653105605393648, "rewards/clip_reward": 0.27924566715955734, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0005548448746204706, "grad_norm": 0.16347182492777684, "kl": 0.020050048828125, "learning_rate": 1e-05, "loss": 0.0008, "reward": 0.2851836755871773, "reward_std": 0.034420196898281574, "rewards/clip_reward": 0.2851836755871773, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0005856695898771635, "grad_norm": 0.1227504829797276, "kl": 0.02288818359375, "learning_rate": 1e-05, "loss": 0.0009, "reward": 0.3039173483848572, "reward_std": 0.04395513795316219, "rewards/clip_reward": 0.3039173483848572, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0006164943051338563, "grad_norm": 0.11185292631745263, "kl": 0.01983642578125, "learning_rate": 1e-05, "loss": 0.0008, "reward": 0.2992554157972336, "reward_std": 0.043223864398896694, "rewards/clip_reward": 0.2992554157972336, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0006473190203905491, "grad_norm": 0.11079321237999702, "kl": 0.0237884521484375, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.2894679084420204, "reward_std": 0.03523569507524371, "rewards/clip_reward": 0.2894679084420204, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.000678143735647242, "grad_norm": 0.12194111403192436, "kl": 0.02587890625, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.2820267304778099, "reward_std": 0.040216268971562386, "rewards/clip_reward": 0.2820267304778099, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0007089684509039348, "grad_norm": 0.14764947004412698, "kl": 0.030364990234375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.30307962000370026, "reward_std": 0.0421298248693347, "rewards/clip_reward": 0.30307962000370026, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0007397931661606276, "grad_norm": 0.13740957875900825, "kl": 0.029144287109375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.30139467120170593, "reward_std": 0.03585191536694765, "rewards/clip_reward": 0.30139467120170593, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0007706178814173204, "grad_norm": 0.1589748523019112, "kl": 0.030548095703125, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.28962523490190506, "reward_std": 0.03408448817208409, "rewards/clip_reward": 0.28962523490190506, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0008014425966740132, "grad_norm": 0.12748131467646742, "kl": 0.023712158203125, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.2800466865301132, "reward_std": 0.03166115842759609, "rewards/clip_reward": 0.2800466865301132, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.000832267311930706, "grad_norm": 0.10841245478899014, "kl": 0.02532958984375, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.277116097509861, "reward_std": 0.038001535926014185, "rewards/clip_reward": 0.277116097509861, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0008630920271873989, "grad_norm": 0.12709001525834415, "kl": 0.02606201171875, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.27604615688323975, "reward_std": 0.033548878505825996, "rewards/clip_reward": 0.27604615688323975, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0008939167424440917, "grad_norm": 0.21267185014697698, "kl": 0.036163330078125, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.28682367503643036, "reward_std": 0.04210791550576687, "rewards/clip_reward": 0.28682367503643036, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0009247414577007845, "grad_norm": 0.18532382846975015, "kl": 0.033203125, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2868439294397831, "reward_std": 0.03913262952119112, "rewards/clip_reward": 0.2868439294397831, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0009555661729574773, "grad_norm": 0.12356243000047522, "kl": 0.03118896484375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.2863647863268852, "reward_std": 0.04316131863743067, "rewards/clip_reward": 0.2863647863268852, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0009863908882141701, "grad_norm": 0.110931968286718, "kl": 0.027008056640625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2939675599336624, "reward_std": 0.039077806286513805, "rewards/clip_reward": 0.2939675599336624, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001017215603470863, "grad_norm": 0.2545347541932697, "kl": 0.02960205078125, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.2887604981660843, "reward_std": 0.043353252578526735, "rewards/clip_reward": 0.2887604981660843, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0010480403187275557, "grad_norm": 0.12476611864849307, "kl": 0.035308837890625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.2868190184235573, "reward_std": 0.04044362064450979, "rewards/clip_reward": 0.2868190184235573, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0010788650339842486, "grad_norm": 0.12751972659201788, "kl": 0.031951904296875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2924063131213188, "reward_std": 0.03874353598803282, "rewards/clip_reward": 0.2924063131213188, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0011096897492409413, "grad_norm": 0.11592072388218817, "kl": 0.033538818359375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3099871575832367, "reward_std": 0.040243714582175016, "rewards/clip_reward": 0.3099871575832367, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0011405144644976342, "grad_norm": 0.11691018858992809, "kl": 0.028228759765625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.29740212112665176, "reward_std": 0.03775101434439421, "rewards/clip_reward": 0.29740212112665176, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001171339179754327, "grad_norm": 0.12694146379271082, "kl": 0.029876708984375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.29666946083307266, "reward_std": 0.03896902687847614, "rewards/clip_reward": 0.29666946083307266, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0012021638950110197, "grad_norm": 0.11382088518050484, "kl": 0.034820556640625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.25845085084438324, "reward_std": 0.035137762781232595, "rewards/clip_reward": 0.25845085084438324, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0012329886102677126, "grad_norm": 0.11785591889735988, "kl": 0.03558349609375, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.3000144585967064, "reward_std": 0.041856614872813225, "rewards/clip_reward": 0.3000144585967064, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0012638133255244055, "grad_norm": 0.14974893291010552, "kl": 0.03472900390625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.292422890663147, "reward_std": 0.03956524468958378, "rewards/clip_reward": 0.292422890663147, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0012946380407810982, "grad_norm": 0.12625257582654123, "kl": 0.043701171875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.2874472737312317, "reward_std": 0.03826928976923227, "rewards/clip_reward": 0.2874472737312317, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001325462756037791, "grad_norm": 0.15391621735175193, "kl": 0.052886962890625, "learning_rate": 1e-05, "loss": 0.0021, "reward": 0.29409360885620117, "reward_std": 0.03215181827545166, "rewards/clip_reward": 0.29409360885620117, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001356287471294484, "grad_norm": 0.11856390130857818, "kl": 0.02789306640625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.3074764534831047, "reward_std": 0.03519732179120183, "rewards/clip_reward": 0.3074764534831047, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0013871121865511767, "grad_norm": 0.11525897073099471, "kl": 0.02886962890625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.290186382830143, "reward_std": 0.042675744742155075, "rewards/clip_reward": 0.290186382830143, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0014179369018078695, "grad_norm": 0.11130066543273066, "kl": 0.031402587890625, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2904057502746582, "reward_std": 0.03623047983273864, "rewards/clip_reward": 0.2904057502746582, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0014487616170645624, "grad_norm": 0.12239293095726622, "kl": 0.037353515625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.30481819808483124, "reward_std": 0.032313164323568344, "rewards/clip_reward": 0.30481819808483124, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0014795863323212551, "grad_norm": 0.12373536144300279, "kl": 0.03485107421875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.29724979400634766, "reward_std": 0.03943999111652374, "rewards/clip_reward": 0.29724979400634766, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001510411047577948, "grad_norm": 0.1159028647129839, "kl": 0.03643798828125, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.29280055314302444, "reward_std": 0.037895018234848976, "rewards/clip_reward": 0.29280055314302444, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001541235762834641, "grad_norm": 0.11547994419061709, "kl": 0.027984619140625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2938268706202507, "reward_std": 0.03788345959037542, "rewards/clip_reward": 0.2938268706202507, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0015720604780913336, "grad_norm": 1.6877147367273317, "kl": 0.1016845703125, "learning_rate": 1e-05, "loss": 0.0041, "reward": 0.29082879424095154, "reward_std": 0.039864601101726294, "rewards/clip_reward": 0.29082879424095154, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0016028851933480265, "grad_norm": 0.21224329234905434, "kl": 0.060516357421875, "learning_rate": 1e-05, "loss": 0.0024, "reward": 0.29952527582645416, "reward_std": 0.029881142545491457, "rewards/clip_reward": 0.29952527582645416, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0016337099086047194, "grad_norm": 0.11290164286969186, "kl": 0.029541015625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.2998390942811966, "reward_std": 0.035071507561951876, "rewards/clip_reward": 0.2998390942811966, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001664534623861412, "grad_norm": 0.11400285072826415, "kl": 0.027587890625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.29009225964546204, "reward_std": 0.03698861412703991, "rewards/clip_reward": 0.29009225964546204, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001695359339118105, "grad_norm": 0.13065371322254837, "kl": 0.028961181640625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.3071891888976097, "reward_std": 0.029143241234123707, "rewards/clip_reward": 0.3071891888976097, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0017261840543747978, "grad_norm": 0.1129906144070696, "kl": 0.02569580078125, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.29338589310646057, "reward_std": 0.03269250225275755, "rewards/clip_reward": 0.29338589310646057, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0017570087696314905, "grad_norm": 0.11727642814637977, "kl": 0.0286865234375, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.31817278265953064, "reward_std": 0.03473840607330203, "rewards/clip_reward": 0.31817278265953064, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0017878334848881834, "grad_norm": 0.13164061182282957, "kl": 0.02825927734375, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2971828728914261, "reward_std": 0.03734842874109745, "rewards/clip_reward": 0.2971828728914261, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001818658200144876, "grad_norm": 0.11765242827689301, "kl": 0.029510498046875, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.31315483897924423, "reward_std": 0.03106481023132801, "rewards/clip_reward": 0.31315483897924423, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.001849482915401569, "grad_norm": 0.12677244684117328, "kl": 0.023529052734375, "learning_rate": 1e-05, "loss": 0.0009, "reward": 0.3002154156565666, "reward_std": 0.03606006037443876, "rewards/clip_reward": 0.3002154156565666, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0018803076306582618, "grad_norm": 0.136174367743151, "kl": 0.034423828125, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.2926482856273651, "reward_std": 0.03695660084486008, "rewards/clip_reward": 0.2926482856273651, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0019111323459149545, "grad_norm": 0.11353122868188088, "kl": 0.025970458984375, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.29593927413225174, "reward_std": 0.03822559863328934, "rewards/clip_reward": 0.29593927413225174, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0019419570611716474, "grad_norm": 0.11947371382126508, "kl": 0.02886962890625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.2990872785449028, "reward_std": 0.03435507323592901, "rewards/clip_reward": 0.2990872785449028, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0019727817764283403, "grad_norm": 0.10960459564919689, "kl": 0.027374267578125, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2825084328651428, "reward_std": 0.037316225469112396, "rewards/clip_reward": 0.2825084328651428, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002003606491685033, "grad_norm": 0.10946023679973685, "kl": 0.024322509765625, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.3126995787024498, "reward_std": 0.035230320412665606, "rewards/clip_reward": 0.3126995787024498, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002034431206941726, "grad_norm": 0.10983182888889798, "kl": 0.0341796875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.2903076857328415, "reward_std": 0.03482948988676071, "rewards/clip_reward": 0.2903076857328415, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0020652559221984185, "grad_norm": 0.11287812931379468, "kl": 0.02862548828125, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.30673525482416153, "reward_std": 0.03648731391876936, "rewards/clip_reward": 0.30673525482416153, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0020960806374551114, "grad_norm": 0.1116123252766076, "kl": 0.028045654296875, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.3118920400738716, "reward_std": 0.04191158525645733, "rewards/clip_reward": 0.3118920400738716, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0021269053527118043, "grad_norm": 0.13046284746258094, "kl": 0.0286865234375, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.3048221841454506, "reward_std": 0.03315945668146014, "rewards/clip_reward": 0.3048221841454506, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002157730067968497, "grad_norm": 0.12197157089162045, "kl": 0.025115966796875, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.2929798662662506, "reward_std": 0.03310262132436037, "rewards/clip_reward": 0.2929798662662506, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00218855478322519, "grad_norm": 0.10833880759656236, "kl": 0.02850341796875, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2930976450443268, "reward_std": 0.03318624943494797, "rewards/clip_reward": 0.2930976450443268, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0022193794984818826, "grad_norm": 0.11609773141793645, "kl": 0.02789306640625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.2967787832021713, "reward_std": 0.0352731691673398, "rewards/clip_reward": 0.2967787832021713, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0022502042137385755, "grad_norm": 0.10524474043777819, "kl": 0.02496337890625, "learning_rate": 1e-05, "loss": 0.001, "reward": 0.31397951394319534, "reward_std": 0.03302141930907965, "rewards/clip_reward": 0.31397951394319534, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0022810289289952683, "grad_norm": 0.11645830499407812, "kl": 0.03143310546875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.304028183221817, "reward_std": 0.03270072164013982, "rewards/clip_reward": 0.304028183221817, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0023118536442519612, "grad_norm": 0.11708657920762414, "kl": 0.0277099609375, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.29009127616882324, "reward_std": 0.034541524946689606, "rewards/clip_reward": 0.29009127616882324, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002342678359508654, "grad_norm": 0.1474963588837119, "kl": 0.029022216796875, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.3042903319001198, "reward_std": 0.03551435098052025, "rewards/clip_reward": 0.3042903319001198, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002373503074765347, "grad_norm": 0.11094369240221044, "kl": 0.03167724609375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3129897713661194, "reward_std": 0.030104911886155605, "rewards/clip_reward": 0.3129897713661194, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0024043277900220395, "grad_norm": 0.11276532754709433, "kl": 0.029083251953125, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.28792131692171097, "reward_std": 0.03244967618957162, "rewards/clip_reward": 0.28792131692171097, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0024351525052787324, "grad_norm": 2.4407080756052175, "kl": 0.2354736328125, "learning_rate": 1e-05, "loss": 0.0094, "reward": 0.3102322220802307, "reward_std": 0.0350488992407918, "rewards/clip_reward": 0.3102322220802307, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0024659772205354253, "grad_norm": 0.1159358540029852, "kl": 0.035400390625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.30823151022195816, "reward_std": 0.039351899176836014, "rewards/clip_reward": 0.30823151022195816, "step": 80 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002496801935792118, "grad_norm": 0.12888152498248232, "kl": 0.027679443359375, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.31155603379011154, "reward_std": 0.03785201674327254, "rewards/clip_reward": 0.31155603379011154, "step": 81 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002527626651048811, "grad_norm": 0.118057549023165, "kl": 0.031951904296875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2901509776711464, "reward_std": 0.03677979623898864, "rewards/clip_reward": 0.2901509776711464, "step": 82 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002558451366305504, "grad_norm": 0.13671900392730388, "kl": 0.03436279296875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.31552984565496445, "reward_std": 0.03665575571358204, "rewards/clip_reward": 0.31552984565496445, "step": 83 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0025892760815621964, "grad_norm": 0.1338150548332209, "kl": 0.02691650390625, "learning_rate": 1e-05, "loss": 0.0011, "reward": 0.29462432861328125, "reward_std": 0.030553956981748343, "rewards/clip_reward": 0.29462432861328125, "step": 84 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0026201007968188893, "grad_norm": 0.11938476789667336, "kl": 0.035247802734375, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.29117922484874725, "reward_std": 0.034703842364251614, "rewards/clip_reward": 0.29117922484874725, "step": 85 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002650925512075582, "grad_norm": 0.15290800659433915, "kl": 0.02923583984375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.30999132990837097, "reward_std": 0.03528518043458462, "rewards/clip_reward": 0.30999132990837097, "step": 86 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002681750227332275, "grad_norm": 0.1901908492516557, "kl": 0.035125732421875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.30646923929452896, "reward_std": 0.030330040026456118, "rewards/clip_reward": 0.30646923929452896, "step": 87 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002712574942588968, "grad_norm": 0.11770721369651402, "kl": 0.033416748046875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.29444392770528793, "reward_std": 0.03116408735513687, "rewards/clip_reward": 0.29444392770528793, "step": 88 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002743399657845661, "grad_norm": 0.1826885288603463, "kl": 0.03265380859375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2949918806552887, "reward_std": 0.034480467438697815, "rewards/clip_reward": 0.2949918806552887, "step": 89 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0027742243731023533, "grad_norm": 0.12056299378953052, "kl": 0.03338623046875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.31496553868055344, "reward_std": 0.03079960821196437, "rewards/clip_reward": 0.31496553868055344, "step": 90 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002805049088359046, "grad_norm": 0.12457250512365349, "kl": 0.03619384765625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.28923606872558594, "reward_std": 0.03267038939520717, "rewards/clip_reward": 0.28923606872558594, "step": 91 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002835873803615739, "grad_norm": 1.5672006078532907, "kl": 0.07281494140625, "learning_rate": 1e-05, "loss": 0.0029, "reward": 0.3001748248934746, "reward_std": 0.03475807560607791, "rewards/clip_reward": 0.3001748248934746, "step": 92 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002866698518872432, "grad_norm": 0.14261331472528152, "kl": 0.032684326171875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.323921799659729, "reward_std": 0.03504910413175821, "rewards/clip_reward": 0.323921799659729, "step": 93 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002897523234129125, "grad_norm": 0.11567854576839746, "kl": 0.0318603515625, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3107759431004524, "reward_std": 0.03745063720270991, "rewards/clip_reward": 0.3107759431004524, "step": 94 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0029283479493858173, "grad_norm": 0.11598622472023176, "kl": 0.030853271484375, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.3175903409719467, "reward_std": 0.03149988315999508, "rewards/clip_reward": 0.3175903409719467, "step": 95 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0029591726646425102, "grad_norm": 0.1254209118683483, "kl": 0.032806396484375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.28729958087205887, "reward_std": 0.031386380549520254, "rewards/clip_reward": 0.28729958087205887, "step": 96 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.002989997379899203, "grad_norm": 0.12174331071691413, "kl": 0.0316162109375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3118669241666794, "reward_std": 0.032158670015633106, "rewards/clip_reward": 0.3118669241666794, "step": 97 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003020822095155896, "grad_norm": 0.11712521290025418, "kl": 0.03448486328125, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.31112753599882126, "reward_std": 0.043876828625798225, "rewards/clip_reward": 0.31112753599882126, "step": 98 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003051646810412589, "grad_norm": 0.11859882642032223, "kl": 0.03875732421875, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.2820296436548233, "reward_std": 0.03406182769685984, "rewards/clip_reward": 0.2820296436548233, "step": 99 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003082471525669282, "grad_norm": 0.12885755957691103, "kl": 0.03619384765625, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.30988558381795883, "reward_std": 0.03780778869986534, "rewards/clip_reward": 0.30988558381795883, "step": 100 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0031132962409259743, "grad_norm": 0.11306253389811041, "kl": 0.033660888671875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3102700859308243, "reward_std": 0.03908272087574005, "rewards/clip_reward": 0.3102700859308243, "step": 101 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003144120956182667, "grad_norm": 0.11386105699536472, "kl": 0.0308837890625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.29308290779590607, "reward_std": 0.03458858421072364, "rewards/clip_reward": 0.29308290779590607, "step": 102 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00317494567143936, "grad_norm": 0.10250552377032608, "kl": 0.035369873046875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.31484246253967285, "reward_std": 0.03848233912140131, "rewards/clip_reward": 0.31484246253967285, "step": 103 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003205770386696053, "grad_norm": 0.11041408780399448, "kl": 0.03509521484375, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.30977974832057953, "reward_std": 0.0354487132281065, "rewards/clip_reward": 0.30977974832057953, "step": 104 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003236595101952746, "grad_norm": 0.11590179364539747, "kl": 0.0321044921875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3202349618077278, "reward_std": 0.03078141063451767, "rewards/clip_reward": 0.3202349618077278, "step": 105 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0032674198172094387, "grad_norm": 0.14734135195006995, "kl": 0.035980224609375, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.3130447119474411, "reward_std": 0.031586550641804934, "rewards/clip_reward": 0.3130447119474411, "step": 106 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003298244532466131, "grad_norm": 0.11436474499458421, "kl": 0.03759765625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.2865230664610863, "reward_std": 0.027899319771677256, "rewards/clip_reward": 0.2865230664610863, "step": 107 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003329069247722824, "grad_norm": 0.10968741552838084, "kl": 0.031463623046875, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2957867458462715, "reward_std": 0.0352176409214735, "rewards/clip_reward": 0.2957867458462715, "step": 108 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003359893962979517, "grad_norm": 0.12582085065454826, "kl": 0.030670166015625, "learning_rate": 1e-05, "loss": 0.0012, "reward": 0.3016505390405655, "reward_std": 0.04240915086120367, "rewards/clip_reward": 0.3016505390405655, "step": 109 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00339071867823621, "grad_norm": 0.10773491440317534, "kl": 0.03265380859375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.2948762997984886, "reward_std": 0.034737172070890665, "rewards/clip_reward": 0.2948762997984886, "step": 110 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0034215433934929027, "grad_norm": 0.10632490654255654, "kl": 0.03411865234375, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.2969469800591469, "reward_std": 0.03294783923774958, "rewards/clip_reward": 0.2969469800591469, "step": 111 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0034523681087495956, "grad_norm": 0.10780628812986831, "kl": 0.03314208984375, "learning_rate": 1e-05, "loss": 0.0013, "reward": 0.3059050217270851, "reward_std": 0.035596927627921104, "rewards/clip_reward": 0.3059050217270851, "step": 112 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003483192824006288, "grad_norm": 0.10992023386661232, "kl": 0.0360107421875, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.3140244632959366, "reward_std": 0.031090704258531332, "rewards/clip_reward": 0.3140244632959366, "step": 113 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003514017539262981, "grad_norm": 0.10972697905672188, "kl": 0.0379638671875, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.31535517424345016, "reward_std": 0.03570608049631119, "rewards/clip_reward": 0.31535517424345016, "step": 114 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003544842254519674, "grad_norm": 0.11267471386768459, "kl": 0.03448486328125, "learning_rate": 1e-05, "loss": 0.0014, "reward": 0.2879750058054924, "reward_std": 0.04104418680071831, "rewards/clip_reward": 0.2879750058054924, "step": 115 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0035756669697763668, "grad_norm": 0.11162685314945246, "kl": 0.038330078125, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.2966529279947281, "reward_std": 0.03232752811163664, "rewards/clip_reward": 0.2966529279947281, "step": 116 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0036064916850330597, "grad_norm": 0.1078402292948255, "kl": 0.03717041015625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.3015007972717285, "reward_std": 0.03359420504420996, "rewards/clip_reward": 0.3015007972717285, "step": 117 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003637316400289752, "grad_norm": 0.11308735376651692, "kl": 0.03997802734375, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.2964186370372772, "reward_std": 0.02699094917625189, "rewards/clip_reward": 0.2964186370372772, "step": 118 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003668141115546445, "grad_norm": 0.11055310636563517, "kl": 0.036590576171875, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.305886946618557, "reward_std": 0.03268259018659592, "rewards/clip_reward": 0.305886946618557, "step": 119 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003698965830803138, "grad_norm": 0.10880733102325107, "kl": 0.04144287109375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3160470202565193, "reward_std": 0.03552013309672475, "rewards/clip_reward": 0.3160470202565193, "step": 120 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003729790546059831, "grad_norm": 0.1084319678958757, "kl": 0.03759765625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.30664851516485214, "reward_std": 0.03642770275473595, "rewards/clip_reward": 0.30664851516485214, "step": 121 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0037606152613165237, "grad_norm": 0.10716440463937717, "kl": 0.03912353515625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.3061741515994072, "reward_std": 0.03255116753280163, "rewards/clip_reward": 0.3061741515994072, "step": 122 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0037914399765732166, "grad_norm": 0.10921412355349838, "kl": 0.04193115234375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3017224445939064, "reward_std": 0.035058747977018356, "rewards/clip_reward": 0.3017224445939064, "step": 123 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003822264691829909, "grad_norm": 0.10891562876371483, "kl": 0.04351806640625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3088080510497093, "reward_std": 0.03202015720307827, "rewards/clip_reward": 0.3088080510497093, "step": 124 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003853089407086602, "grad_norm": 0.12400681002324238, "kl": 0.0430908203125, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.30519475787878036, "reward_std": 0.04081529099494219, "rewards/clip_reward": 0.30519475787878036, "step": 125 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003883914122343295, "grad_norm": 0.10684242248297056, "kl": 0.03955078125, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.2999924272298813, "reward_std": 0.027399181853979826, "rewards/clip_reward": 0.2999924272298813, "step": 126 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003914738837599988, "grad_norm": 0.13383282223405826, "kl": 0.043212890625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.29980389028787613, "reward_std": 0.02915497263893485, "rewards/clip_reward": 0.29980389028787613, "step": 127 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.003945563552856681, "grad_norm": 0.10680237923679747, "kl": 0.0423583984375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3113924115896225, "reward_std": 0.034374223090708256, "rewards/clip_reward": 0.3113924115896225, "step": 128 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0039763882681133735, "grad_norm": 0.11633733033299526, "kl": 0.0419921875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3037688434123993, "reward_std": 0.030008903238922358, "rewards/clip_reward": 0.3037688434123993, "step": 129 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004007212983370066, "grad_norm": 0.10824185375409869, "kl": 0.043212890625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3037646785378456, "reward_std": 0.03128322120755911, "rewards/clip_reward": 0.3037646785378456, "step": 130 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004038037698626759, "grad_norm": 0.11498093646788146, "kl": 0.04241943359375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3007878288626671, "reward_std": 0.035366450902074575, "rewards/clip_reward": 0.3007878288626671, "step": 131 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004068862413883452, "grad_norm": 0.10988923049203525, "kl": 0.04412841796875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.2908458858728409, "reward_std": 0.037285988219082355, "rewards/clip_reward": 0.2908458858728409, "step": 132 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004099687129140144, "grad_norm": 0.11966799658540882, "kl": 0.0472412109375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.31911730766296387, "reward_std": 0.03753689955919981, "rewards/clip_reward": 0.31911730766296387, "step": 133 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004130511844396837, "grad_norm": 0.11694661938666558, "kl": 0.0423583984375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.2962675616145134, "reward_std": 0.03178291115909815, "rewards/clip_reward": 0.2962675616145134, "step": 134 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00416133655965353, "grad_norm": 0.12087212424369766, "kl": 0.04498291015625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.2907417491078377, "reward_std": 0.039098432287573814, "rewards/clip_reward": 0.2907417491078377, "step": 135 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004192161274910223, "grad_norm": 0.1287243807650102, "kl": 0.04473876953125, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3177812397480011, "reward_std": 0.03574381256476045, "rewards/clip_reward": 0.3177812397480011, "step": 136 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004222985990166916, "grad_norm": 0.10737365706668978, "kl": 0.04510498046875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3081594184041023, "reward_std": 0.038351588882505894, "rewards/clip_reward": 0.3081594184041023, "step": 137 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004253810705423609, "grad_norm": 0.10905175589597935, "kl": 0.04962158203125, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.3101942911744118, "reward_std": 0.036289566196501255, "rewards/clip_reward": 0.3101942911744118, "step": 138 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0042846354206803015, "grad_norm": 0.11276085217923483, "kl": 0.045166015625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.30623695254325867, "reward_std": 0.03592977672815323, "rewards/clip_reward": 0.30623695254325867, "step": 139 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004315460135936994, "grad_norm": 0.11293325800639886, "kl": 0.04168701171875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.30889374017715454, "reward_std": 0.037142093293368816, "rewards/clip_reward": 0.30889374017715454, "step": 140 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004346284851193687, "grad_norm": 0.11777523910607922, "kl": 0.0472412109375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3147361949086189, "reward_std": 0.03566309390589595, "rewards/clip_reward": 0.3147361949086189, "step": 141 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00437710956645038, "grad_norm": 0.11847683399423899, "kl": 0.04412841796875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.30750318616628647, "reward_std": 0.031021112576127052, "rewards/clip_reward": 0.30750318616628647, "step": 142 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004407934281707073, "grad_norm": 0.2325766643168979, "kl": 0.04681396484375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3154006227850914, "reward_std": 0.03396408865228295, "rewards/clip_reward": 0.3154006227850914, "step": 143 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004438758996963765, "grad_norm": 0.12205440467431385, "kl": 0.04742431640625, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3048614263534546, "reward_std": 0.03609074279665947, "rewards/clip_reward": 0.3048614263534546, "step": 144 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004469583712220458, "grad_norm": 0.23235604214068906, "kl": 0.04681396484375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.311428502202034, "reward_std": 0.03360119927674532, "rewards/clip_reward": 0.311428502202034, "step": 145 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004500408427477151, "grad_norm": 0.10888274692722734, "kl": 0.04388427734375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.31328508257865906, "reward_std": 0.03548012813553214, "rewards/clip_reward": 0.31328508257865906, "step": 146 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004531233142733844, "grad_norm": 0.10880868321131348, "kl": 0.04425048828125, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3104088082909584, "reward_std": 0.03424055827781558, "rewards/clip_reward": 0.3104088082909584, "step": 147 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004562057857990537, "grad_norm": 0.13184897587638666, "kl": 0.03912353515625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.32070276886224747, "reward_std": 0.035730645060539246, "rewards/clip_reward": 0.32070276886224747, "step": 148 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00459288257324723, "grad_norm": 0.13040013350171695, "kl": 0.036773681640625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.28690390288829803, "reward_std": 0.031756586860865355, "rewards/clip_reward": 0.28690390288829803, "step": 149 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0046237072885039225, "grad_norm": 0.11413869565578771, "kl": 0.0396728515625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.29507312178611755, "reward_std": 0.03319581504911184, "rewards/clip_reward": 0.29507312178611755, "step": 150 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004654532003760615, "grad_norm": 0.11075253321749193, "kl": 0.04193115234375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3052366375923157, "reward_std": 0.03602536814287305, "rewards/clip_reward": 0.3052366375923157, "step": 151 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004685356719017308, "grad_norm": 0.10754829754075829, "kl": 0.0465087890625, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.32933981716632843, "reward_std": 0.03219308517873287, "rewards/clip_reward": 0.32933981716632843, "step": 152 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004716181434274001, "grad_norm": 0.11311221875180785, "kl": 0.03924560546875, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.32051652669906616, "reward_std": 0.03876081760972738, "rewards/clip_reward": 0.32051652669906616, "step": 153 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004747006149530694, "grad_norm": 0.10485333011345627, "kl": 0.04144287109375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3038826063275337, "reward_std": 0.03334263851866126, "rewards/clip_reward": 0.3038826063275337, "step": 154 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004777830864787387, "grad_norm": 0.2148098423660176, "kl": 0.05029296875, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.3050876185297966, "reward_std": 0.03197276359423995, "rewards/clip_reward": 0.3050876185297966, "step": 155 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004808655580044079, "grad_norm": 0.11844316361896051, "kl": 0.03863525390625, "learning_rate": 1e-05, "loss": 0.0015, "reward": 0.32097869366407394, "reward_std": 0.029324380215257406, "rewards/clip_reward": 0.32097869366407394, "step": 156 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004839480295300772, "grad_norm": 0.11792606634997942, "kl": 0.04254150390625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3132959380745888, "reward_std": 0.027422321029007435, "rewards/clip_reward": 0.3132959380745888, "step": 157 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004870305010557465, "grad_norm": 0.11837395235594217, "kl": 0.0396728515625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.3073809891939163, "reward_std": 0.03928511310368776, "rewards/clip_reward": 0.3073809891939163, "step": 158 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004901129725814158, "grad_norm": 0.11129310536332628, "kl": 0.0430908203125, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.2984026074409485, "reward_std": 0.03139211004599929, "rewards/clip_reward": 0.2984026074409485, "step": 159 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0049319544410708505, "grad_norm": 0.11532428624713585, "kl": 0.043212890625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3008668124675751, "reward_std": 0.02627889020368457, "rewards/clip_reward": 0.3008668124675751, "step": 160 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004962779156327543, "grad_norm": 0.11786021221814753, "kl": 0.049560546875, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.28665469214320183, "reward_std": 0.03704976849257946, "rewards/clip_reward": 0.28665469214320183, "step": 161 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.004993603871584236, "grad_norm": 0.11010075273074264, "kl": 0.0458984375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3088128864765167, "reward_std": 0.03618460427969694, "rewards/clip_reward": 0.3088128864765167, "step": 162 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005024428586840929, "grad_norm": 0.13239934947372414, "kl": 0.04388427734375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3036596402525902, "reward_std": 0.032796021085232496, "rewards/clip_reward": 0.3036596402525902, "step": 163 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005055253302097622, "grad_norm": 0.11413695831587789, "kl": 0.04144287109375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.304002046585083, "reward_std": 0.029863339848816395, "rewards/clip_reward": 0.304002046585083, "step": 164 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005086078017354315, "grad_norm": 0.11564845823686422, "kl": 0.04547119140625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.29191841185092926, "reward_std": 0.035626471508294344, "rewards/clip_reward": 0.29191841185092926, "step": 165 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005116902732611008, "grad_norm": 0.11329154232078235, "kl": 0.0458984375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3206318989396095, "reward_std": 0.031096128281205893, "rewards/clip_reward": 0.3206318989396095, "step": 166 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0051477274478677, "grad_norm": 0.13085517741110167, "kl": 0.04510498046875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.28686685115098953, "reward_std": 0.028212732169777155, "rewards/clip_reward": 0.28686685115098953, "step": 167 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005178552163124393, "grad_norm": 0.35649465481543857, "kl": 0.05712890625, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.3119603246450424, "reward_std": 0.03256976744160056, "rewards/clip_reward": 0.3119603246450424, "step": 168 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005209376878381086, "grad_norm": 0.13826424691903896, "kl": 0.047607421875, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.2963334918022156, "reward_std": 0.035677722189575434, "rewards/clip_reward": 0.2963334918022156, "step": 169 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005240201593637779, "grad_norm": 0.1534567095384766, "kl": 0.05511474609375, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.3064122945070267, "reward_std": 0.030035972129553556, "rewards/clip_reward": 0.3064122945070267, "step": 170 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0052710263088944715, "grad_norm": 0.14535366087971044, "kl": 0.0533447265625, "learning_rate": 1e-05, "loss": 0.0021, "reward": 0.31411340832710266, "reward_std": 0.03346576215699315, "rewards/clip_reward": 0.31411340832710266, "step": 171 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005301851024151164, "grad_norm": 0.11182081037914528, "kl": 0.04864501953125, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.31372761726379395, "reward_std": 0.028209302574396133, "rewards/clip_reward": 0.31372761726379395, "step": 172 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005332675739407857, "grad_norm": 0.10998805611249249, "kl": 0.0506591796875, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.3076253980398178, "reward_std": 0.03530415939167142, "rewards/clip_reward": 0.3076253980398178, "step": 173 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00536350045466455, "grad_norm": 0.11782096135818707, "kl": 0.04974365234375, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.31246717274188995, "reward_std": 0.029364202171564102, "rewards/clip_reward": 0.31246717274188995, "step": 174 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005394325169921243, "grad_norm": 0.12510210623580764, "kl": 0.045166015625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3149753659963608, "reward_std": 0.03486820124089718, "rewards/clip_reward": 0.3149753659963608, "step": 175 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005425149885177936, "grad_norm": 0.12073650226365598, "kl": 0.04754638671875, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3087178245186806, "reward_std": 0.03398646041750908, "rewards/clip_reward": 0.3087178245186806, "step": 176 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005455974600434629, "grad_norm": 0.16141440704371318, "kl": 0.046630859375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.32408736646175385, "reward_std": 0.03735980670899153, "rewards/clip_reward": 0.32408736646175385, "step": 177 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005486799315691322, "grad_norm": 0.12441494367513423, "kl": 0.04547119140625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.32129447907209396, "reward_std": 0.02921806275844574, "rewards/clip_reward": 0.32129447907209396, "step": 178 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005517624030948014, "grad_norm": 0.11472079048722336, "kl": 0.04193115234375, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3252818211913109, "reward_std": 0.032147477846592665, "rewards/clip_reward": 0.3252818211913109, "step": 179 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005548448746204707, "grad_norm": 0.11643678981280302, "kl": 0.04559326171875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.31385669857263565, "reward_std": 0.031697194557636976, "rewards/clip_reward": 0.31385669857263565, "step": 180 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0055792734614613995, "grad_norm": 0.10985906766240239, "kl": 0.0438232421875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3218560591340065, "reward_std": 0.030952177941799164, "rewards/clip_reward": 0.3218560591340065, "step": 181 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005610098176718092, "grad_norm": 0.12318124273748278, "kl": 0.044189453125, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3146945610642433, "reward_std": 0.037078809924423695, "rewards/clip_reward": 0.3146945610642433, "step": 182 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005640922891974785, "grad_norm": 0.10855110593795812, "kl": 0.0469970703125, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3247036188840866, "reward_std": 0.03742914833128452, "rewards/clip_reward": 0.3247036188840866, "step": 183 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005671747607231478, "grad_norm": 0.12463338581811485, "kl": 0.04730224609375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3007904663681984, "reward_std": 0.030957046430557966, "rewards/clip_reward": 0.3007904663681984, "step": 184 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005702572322488171, "grad_norm": 0.11738351599765742, "kl": 0.04608154296875, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3152424767613411, "reward_std": 0.032630473375320435, "rewards/clip_reward": 0.3152424767613411, "step": 185 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005733397037744864, "grad_norm": 0.12425233709466331, "kl": 0.0426025390625, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3105937987565994, "reward_std": 0.037436836399137974, "rewards/clip_reward": 0.3105937987565994, "step": 186 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005764221753001557, "grad_norm": 0.24540233073575124, "kl": 0.0604248046875, "learning_rate": 1e-05, "loss": 0.0024, "reward": 0.3193615674972534, "reward_std": 0.02923456858843565, "rewards/clip_reward": 0.3193615674972534, "step": 187 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00579504646825825, "grad_norm": 0.11122419357690229, "kl": 0.04248046875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.29561520367860794, "reward_std": 0.03093513334169984, "rewards/clip_reward": 0.29561520367860794, "step": 188 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005825871183514943, "grad_norm": 0.11723471576199568, "kl": 0.04541015625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3073701113462448, "reward_std": 0.03550923429429531, "rewards/clip_reward": 0.3073701113462448, "step": 189 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005856695898771635, "grad_norm": 0.10846043413336356, "kl": 0.04443359375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3202974200248718, "reward_std": 0.03128951042890549, "rewards/clip_reward": 0.3202974200248718, "step": 190 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005887520614028328, "grad_norm": 0.10684544727987075, "kl": 0.04400634765625, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3150642290711403, "reward_std": 0.038218459114432335, "rewards/clip_reward": 0.3150642290711403, "step": 191 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0059183453292850205, "grad_norm": 0.12179014361836414, "kl": 0.0406494140625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.3023750111460686, "reward_std": 0.028901703655719757, "rewards/clip_reward": 0.3023750111460686, "step": 192 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005949170044541713, "grad_norm": 0.11287740721272002, "kl": 0.04296875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.30537480115890503, "reward_std": 0.03340973751619458, "rewards/clip_reward": 0.30537480115890503, "step": 193 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.005979994759798406, "grad_norm": 0.11953948724468283, "kl": 0.0406494140625, "learning_rate": 1e-05, "loss": 0.0016, "reward": 0.2977057322859764, "reward_std": 0.041334839537739754, "rewards/clip_reward": 0.2977057322859764, "step": 194 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006010819475055099, "grad_norm": 0.33669753751768144, "kl": 0.05206298828125, "learning_rate": 1e-05, "loss": 0.0021, "reward": 0.3174924701452255, "reward_std": 0.03484439663589001, "rewards/clip_reward": 0.3174924701452255, "step": 195 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006041644190311792, "grad_norm": 0.13081401668031936, "kl": 0.0428466796875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.31883813440799713, "reward_std": 0.03002287307754159, "rewards/clip_reward": 0.31883813440799713, "step": 196 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006072468905568485, "grad_norm": 0.15362262357445108, "kl": 0.04388427734375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3148084282875061, "reward_std": 0.03563790209591389, "rewards/clip_reward": 0.3148084282875061, "step": 197 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006103293620825178, "grad_norm": 0.12036606667480174, "kl": 0.04522705078125, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3185431882739067, "reward_std": 0.03547387337312102, "rewards/clip_reward": 0.3185431882739067, "step": 198 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006134118336081871, "grad_norm": 0.11265487428982754, "kl": 0.04522705078125, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.3013475388288498, "reward_std": 0.027531601022928953, "rewards/clip_reward": 0.3013475388288498, "step": 199 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006164943051338564, "grad_norm": 0.12480397529853451, "kl": 0.04632568359375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.2954695001244545, "reward_std": 0.035041794180870056, "rewards/clip_reward": 0.2954695001244545, "step": 200 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0061957677665952565, "grad_norm": 0.11326384693922859, "kl": 0.04779052734375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.30719564110040665, "reward_std": 0.03018721006810665, "rewards/clip_reward": 0.30719564110040665, "step": 201 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0062265924818519485, "grad_norm": 0.14001014918782298, "kl": 0.0477294921875, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.27801472693681717, "reward_std": 0.029184456914663315, "rewards/clip_reward": 0.27801472693681717, "step": 202 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006257417197108641, "grad_norm": 0.13598377277553347, "kl": 0.04864501953125, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.31848207861185074, "reward_std": 0.029095898382365704, "rewards/clip_reward": 0.31848207861185074, "step": 203 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006288241912365334, "grad_norm": 0.12990427030922783, "kl": 0.04669189453125, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3233681470155716, "reward_std": 0.02884372603148222, "rewards/clip_reward": 0.3233681470155716, "step": 204 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006319066627622027, "grad_norm": 0.11337348510966652, "kl": 0.04705810546875, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.30711568146944046, "reward_std": 0.027397962752729654, "rewards/clip_reward": 0.30711568146944046, "step": 205 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00634989134287872, "grad_norm": 0.11008239157225132, "kl": 0.04534912109375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.31931688636541367, "reward_std": 0.029018502216786146, "rewards/clip_reward": 0.31931688636541367, "step": 206 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006380716058135413, "grad_norm": 0.11005906943350524, "kl": 0.04534912109375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.2889741063117981, "reward_std": 0.03417292470112443, "rewards/clip_reward": 0.2889741063117981, "step": 207 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006411540773392106, "grad_norm": 0.13107793374479376, "kl": 0.04388427734375, "learning_rate": 1e-05, "loss": 0.0018, "reward": 0.316011942923069, "reward_std": 0.035842195618897676, "rewards/clip_reward": 0.316011942923069, "step": 208 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006442365488648799, "grad_norm": 0.11921653233574052, "kl": 0.04827880859375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3120528683066368, "reward_std": 0.03157835826277733, "rewards/clip_reward": 0.3120528683066368, "step": 209 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006473190203905492, "grad_norm": 0.11671423552831947, "kl": 0.04705810546875, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3096734285354614, "reward_std": 0.02532344777137041, "rewards/clip_reward": 0.3096734285354614, "step": 210 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0065040149191621845, "grad_norm": 0.12478193702072561, "kl": 0.04876708984375, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.3255714699625969, "reward_std": 0.03476850828155875, "rewards/clip_reward": 0.3255714699625969, "step": 211 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006534839634418877, "grad_norm": 0.11234041456760836, "kl": 0.043701171875, "learning_rate": 1e-05, "loss": 0.0017, "reward": 0.3205392137169838, "reward_std": 0.02879873849451542, "rewards/clip_reward": 0.3205392137169838, "step": 212 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0065656643496755695, "grad_norm": 0.11484489412906124, "kl": 0.04779052734375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.30573664605617523, "reward_std": 0.034792355727404356, "rewards/clip_reward": 0.30573664605617523, "step": 213 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006596489064932262, "grad_norm": 0.112973116545128, "kl": 0.0565185546875, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.322362020611763, "reward_std": 0.040699029341340065, "rewards/clip_reward": 0.322362020611763, "step": 214 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006627313780188955, "grad_norm": 0.11000967164413539, "kl": 0.0496826171875, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.32849258929491043, "reward_std": 0.03333634790033102, "rewards/clip_reward": 0.32849258929491043, "step": 215 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006658138495445648, "grad_norm": 0.11266446524081561, "kl": 0.04681396484375, "learning_rate": 1e-05, "loss": 0.0019, "reward": 0.3268875405192375, "reward_std": 0.041359793394804, "rewards/clip_reward": 0.3268875405192375, "step": 216 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006688963210702341, "grad_norm": 0.1297650863707243, "kl": 0.05572509765625, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.3042534068226814, "reward_std": 0.029003426898270845, "rewards/clip_reward": 0.3042534068226814, "step": 217 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006719787925959034, "grad_norm": 0.1224231057191533, "kl": 0.0494384765625, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.322785347700119, "reward_std": 0.03285923460498452, "rewards/clip_reward": 0.322785347700119, "step": 218 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006750612641215727, "grad_norm": 0.11197370609789116, "kl": 0.0545654296875, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.3130335509777069, "reward_std": 0.028378690592944622, "rewards/clip_reward": 0.3130335509777069, "step": 219 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00678143735647242, "grad_norm": 0.12265292069342315, "kl": 0.0526123046875, "learning_rate": 1e-05, "loss": 0.0021, "reward": 0.3101393133401871, "reward_std": 0.031088348012417555, "rewards/clip_reward": 0.3101393133401871, "step": 220 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006812262071729113, "grad_norm": 0.1301967649695606, "kl": 0.0504150390625, "learning_rate": 1e-05, "loss": 0.002, "reward": 0.3258736953139305, "reward_std": 0.030232697259634733, "rewards/clip_reward": 0.3258736953139305, "step": 221 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0068430867869858055, "grad_norm": 0.12267903014610237, "kl": 0.05633544921875, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.3135734647512436, "reward_std": 0.029037311673164368, "rewards/clip_reward": 0.3135734647512436, "step": 222 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006873911502242498, "grad_norm": 0.1556499324095671, "kl": 0.05694580078125, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.3139733672142029, "reward_std": 0.03242505481466651, "rewards/clip_reward": 0.3139733672142029, "step": 223 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006904736217499191, "grad_norm": 0.14040668849148338, "kl": 0.05712890625, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.2902194894850254, "reward_std": 0.030722644180059433, "rewards/clip_reward": 0.2902194894850254, "step": 224 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006935560932755883, "grad_norm": 0.12469801158231446, "kl": 0.05535888671875, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.3171394020318985, "reward_std": 0.032871958799660206, "rewards/clip_reward": 0.3171394020318985, "step": 225 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006966385648012576, "grad_norm": 0.11664661161299794, "kl": 0.0555419921875, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.30288901180028915, "reward_std": 0.03361931908875704, "rewards/clip_reward": 0.30288901180028915, "step": 226 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.006997210363269269, "grad_norm": 0.12368362093823822, "kl": 0.0579833984375, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.29918094724416733, "reward_std": 0.032192114274948835, "rewards/clip_reward": 0.29918094724416733, "step": 227 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007028035078525962, "grad_norm": 0.13670193873160075, "kl": 0.0626220703125, "learning_rate": 1e-05, "loss": 0.0025, "reward": 0.30772917717695236, "reward_std": 0.03168489225208759, "rewards/clip_reward": 0.30772917717695236, "step": 228 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007058859793782655, "grad_norm": 0.15557420264040966, "kl": 0.053955078125, "learning_rate": 1e-05, "loss": 0.0022, "reward": 0.28914518654346466, "reward_std": 0.03747545275837183, "rewards/clip_reward": 0.28914518654346466, "step": 229 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007089684509039348, "grad_norm": 0.3182664775944573, "kl": 0.08154296875, "learning_rate": 1e-05, "loss": 0.0033, "reward": 0.311465322971344, "reward_std": 0.03380277007818222, "rewards/clip_reward": 0.311465322971344, "step": 230 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007120509224296041, "grad_norm": 0.12693032913934957, "kl": 0.0616455078125, "learning_rate": 1e-05, "loss": 0.0025, "reward": 0.32272306829690933, "reward_std": 0.03391577862203121, "rewards/clip_reward": 0.32272306829690933, "step": 231 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0071513339395527335, "grad_norm": 4.296122241941977, "kl": 0.45172119140625, "learning_rate": 1e-05, "loss": 0.0181, "reward": 0.32933176308870316, "reward_std": 0.03596270922571421, "rewards/clip_reward": 0.32933176308870316, "step": 232 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007182158654809426, "grad_norm": 0.12252452858451943, "kl": 0.06103515625, "learning_rate": 1e-05, "loss": 0.0024, "reward": 0.30073945224285126, "reward_std": 0.035156805999577045, "rewards/clip_reward": 0.30073945224285126, "step": 233 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007212983370066119, "grad_norm": 0.18753510109293484, "kl": 0.0567626953125, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.3166045621037483, "reward_std": 0.036252960562705994, "rewards/clip_reward": 0.3166045621037483, "step": 234 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007243808085322812, "grad_norm": 0.12244712083969155, "kl": 0.0599365234375, "learning_rate": 1e-05, "loss": 0.0024, "reward": 0.32098332792520523, "reward_std": 0.02912920992821455, "rewards/clip_reward": 0.32098332792520523, "step": 235 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007274632800579504, "grad_norm": 0.1153924535202381, "kl": 0.057373046875, "learning_rate": 1e-05, "loss": 0.0023, "reward": 0.30156801640987396, "reward_std": 0.03506749775260687, "rewards/clip_reward": 0.30156801640987396, "step": 236 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007305457515836197, "grad_norm": 982.1130719581894, "kl": 11.4212646484375, "learning_rate": 1e-05, "loss": 0.4579, "reward": 0.3112705275416374, "reward_std": 0.03293308801949024, "rewards/clip_reward": 0.3112705275416374, "step": 237 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00733628223109289, "grad_norm": 0.13912658954576582, "kl": 0.06134033203125, "learning_rate": 1e-05, "loss": 0.0025, "reward": 0.31086497753858566, "reward_std": 0.03602492017671466, "rewards/clip_reward": 0.31086497753858566, "step": 238 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007367106946349583, "grad_norm": 0.15900496360182007, "kl": 0.072021484375, "learning_rate": 1e-05, "loss": 0.0029, "reward": 0.301338754594326, "reward_std": 0.03604850126430392, "rewards/clip_reward": 0.301338754594326, "step": 239 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007397931661606276, "grad_norm": 0.11682552010464033, "kl": 0.069091796875, "learning_rate": 1e-05, "loss": 0.0028, "reward": 0.310367226600647, "reward_std": 0.033799303229898214, "rewards/clip_reward": 0.310367226600647, "step": 240 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007428756376862969, "grad_norm": 0.6484753019850781, "kl": 0.1041259765625, "learning_rate": 1e-05, "loss": 0.0042, "reward": 0.32271357625722885, "reward_std": 0.03783900523558259, "rewards/clip_reward": 0.32271357625722885, "step": 241 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007459581092119662, "grad_norm": 0.12488153334727711, "kl": 0.06195068359375, "learning_rate": 1e-05, "loss": 0.0025, "reward": 0.30252621322870255, "reward_std": 0.03500718716531992, "rewards/clip_reward": 0.30252621322870255, "step": 242 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.0074904058073763545, "grad_norm": 0.11793262109697311, "kl": 0.0692138671875, "learning_rate": 1e-05, "loss": 0.0028, "reward": 0.3169676512479782, "reward_std": 0.03632183885201812, "rewards/clip_reward": 0.3169676512479782, "step": 243 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007521230522633047, "grad_norm": 0.11916442015495367, "kl": 0.06756591796875, "learning_rate": 1e-05, "loss": 0.0027, "reward": 0.32120058685541153, "reward_std": 0.03682229993864894, "rewards/clip_reward": 0.32120058685541153, "step": 244 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.00755205523788974, "grad_norm": 0.13178282787519513, "kl": 0.068115234375, "learning_rate": 1e-05, "loss": 0.0027, "reward": 0.3041011393070221, "reward_std": 0.03216708730906248, "rewards/clip_reward": 0.3041011393070221, "step": 245 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007582879953146433, "grad_norm": 0.1209756865386705, "kl": 0.07470703125, "learning_rate": 1e-05, "loss": 0.003, "reward": 0.30233804881572723, "reward_std": 0.03544025868177414, "rewards/clip_reward": 0.30233804881572723, "step": 246 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007613704668403126, "grad_norm": 0.14259183948300264, "kl": 0.082275390625, "learning_rate": 1e-05, "loss": 0.0033, "reward": 0.297327883541584, "reward_std": 0.03194120712578297, "rewards/clip_reward": 0.297327883541584, "step": 247 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007644529383659818, "grad_norm": 0.11966570448950067, "kl": 0.07708740234375, "learning_rate": 1e-05, "loss": 0.0031, "reward": 0.3017484247684479, "reward_std": 0.03380720689892769, "rewards/clip_reward": 0.3017484247684479, "step": 248 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007675354098916511, "grad_norm": 0.11584912623106094, "kl": 0.079345703125, "learning_rate": 1e-05, "loss": 0.0032, "reward": 0.2936403974890709, "reward_std": 0.03349851304665208, "rewards/clip_reward": 0.2936403974890709, "step": 249 }, { "clip_ratio": 0.0, "completion_length": 4096.0, "epoch": 0.007706178814173204, "grad_norm": 0.13824008661423945, "kl": 0.0780029296875, "learning_rate": 1e-05, "loss": 0.0031, "reward": 0.3258681297302246, "reward_std": 0.04007330071181059, "rewards/clip_reward": 0.3258681297302246, "step": 250 } ], "logging_steps": 1, "max_steps": 32441, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }