| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.007706178814173204, | |
| "eval_steps": 500, | |
| "global_step": 250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 3.082471525669282e-05, | |
| "grad_norm": 0.13662848638776823, | |
| "kl": 0.0, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.2896246537566185, | |
| "reward_std": 0.043548169545829296, | |
| "rewards/clip_reward": 0.2896246537566185, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 6.164943051338563e-05, | |
| "grad_norm": 0.1178878537123743, | |
| "kl": 0.0007257461547851562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0, | |
| "reward": 0.28567346930503845, | |
| "reward_std": 0.04437257535755634, | |
| "rewards/clip_reward": 0.28567346930503845, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 9.247414577007844e-05, | |
| "grad_norm": 0.3404139762424654, | |
| "kl": 0.0029239654541015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0001, | |
| "reward": 0.26198844239115715, | |
| "reward_std": 0.03637277893722057, | |
| "rewards/clip_reward": 0.26198844239115715, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00012329886102677127, | |
| "grad_norm": 0.21757091715643603, | |
| "kl": 0.001544952392578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0001, | |
| "reward": 0.2846095412969589, | |
| "reward_std": 0.03777279099449515, | |
| "rewards/clip_reward": 0.2846095412969589, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00015412357628346408, | |
| "grad_norm": 0.1348527934598492, | |
| "kl": 0.002452850341796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0001, | |
| "reward": 0.23306814581155777, | |
| "reward_std": 0.033804881386458874, | |
| "rewards/clip_reward": 0.23306814581155777, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0001849482915401569, | |
| "grad_norm": 0.14393009622892527, | |
| "kl": 0.00441741943359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.2847321555018425, | |
| "reward_std": 0.040111628361046314, | |
| "rewards/clip_reward": 0.2847321555018425, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00021577300679684973, | |
| "grad_norm": 0.14351852552309294, | |
| "kl": 0.00428009033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.2802872806787491, | |
| "reward_std": 0.0383415911346674, | |
| "rewards/clip_reward": 0.2802872806787491, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00024659772205354254, | |
| "grad_norm": 0.16776667321961602, | |
| "kl": 0.00494384765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0002, | |
| "reward": 0.2881240174174309, | |
| "reward_std": 0.04321274207904935, | |
| "rewards/clip_reward": 0.2881240174174309, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0002774224373102353, | |
| "grad_norm": 0.13329464822711182, | |
| "kl": 0.009368896484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.2757921889424324, | |
| "reward_std": 0.042105874978005886, | |
| "rewards/clip_reward": 0.2757921889424324, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00030824715256692816, | |
| "grad_norm": 0.12887604952918402, | |
| "kl": 0.00988006591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.287712462246418, | |
| "reward_std": 0.03645364008843899, | |
| "rewards/clip_reward": 0.287712462246418, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000339071867823621, | |
| "grad_norm": 0.1231047237735355, | |
| "kl": 0.0100250244140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0004, | |
| "reward": 0.24786356836557388, | |
| "reward_std": 0.036720491014420986, | |
| "rewards/clip_reward": 0.24786356836557388, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0003698965830803138, | |
| "grad_norm": 0.3414993544210762, | |
| "kl": 0.01570892333984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0006, | |
| "reward": 0.31398245692253113, | |
| "reward_std": 0.05272817797958851, | |
| "rewards/clip_reward": 0.31398245692253113, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0004007212983370066, | |
| "grad_norm": 0.13599349338370845, | |
| "kl": 0.014190673828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0006, | |
| "reward": 0.28073475882411003, | |
| "reward_std": 0.04901007656008005, | |
| "rewards/clip_reward": 0.28073475882411003, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00043154601359369945, | |
| "grad_norm": 0.127187147587617, | |
| "kl": 0.0131378173828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0005, | |
| "reward": 0.280636228621006, | |
| "reward_std": 0.039431299082934856, | |
| "rewards/clip_reward": 0.280636228621006, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00046237072885039224, | |
| "grad_norm": 0.13215878836878708, | |
| "kl": 0.0188140869140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0008, | |
| "reward": 0.29201044142246246, | |
| "reward_std": 0.035117349587380886, | |
| "rewards/clip_reward": 0.29201044142246246, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0004931954441070851, | |
| "grad_norm": 0.12256785878118313, | |
| "kl": 0.022613525390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "reward": 0.2798103988170624, | |
| "reward_std": 0.03762377658858895, | |
| "rewards/clip_reward": 0.2798103988170624, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005240201593637779, | |
| "grad_norm": 0.19802700549242463, | |
| "kl": 0.02978515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.27924566715955734, | |
| "reward_std": 0.04653105605393648, | |
| "rewards/clip_reward": 0.27924566715955734, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005548448746204706, | |
| "grad_norm": 0.16347182492777684, | |
| "kl": 0.020050048828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0008, | |
| "reward": 0.2851836755871773, | |
| "reward_std": 0.034420196898281574, | |
| "rewards/clip_reward": 0.2851836755871773, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0005856695898771635, | |
| "grad_norm": 0.1227504829797276, | |
| "kl": 0.02288818359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "reward": 0.3039173483848572, | |
| "reward_std": 0.04395513795316219, | |
| "rewards/clip_reward": 0.3039173483848572, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0006164943051338563, | |
| "grad_norm": 0.11185292631745263, | |
| "kl": 0.01983642578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0008, | |
| "reward": 0.2992554157972336, | |
| "reward_std": 0.043223864398896694, | |
| "rewards/clip_reward": 0.2992554157972336, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0006473190203905491, | |
| "grad_norm": 0.11079321237999702, | |
| "kl": 0.0237884521484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.2894679084420204, | |
| "reward_std": 0.03523569507524371, | |
| "rewards/clip_reward": 0.2894679084420204, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000678143735647242, | |
| "grad_norm": 0.12194111403192436, | |
| "kl": 0.02587890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.2820267304778099, | |
| "reward_std": 0.040216268971562386, | |
| "rewards/clip_reward": 0.2820267304778099, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007089684509039348, | |
| "grad_norm": 0.14764947004412698, | |
| "kl": 0.030364990234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.30307962000370026, | |
| "reward_std": 0.0421298248693347, | |
| "rewards/clip_reward": 0.30307962000370026, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007397931661606276, | |
| "grad_norm": 0.13740957875900825, | |
| "kl": 0.029144287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.30139467120170593, | |
| "reward_std": 0.03585191536694765, | |
| "rewards/clip_reward": 0.30139467120170593, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0007706178814173204, | |
| "grad_norm": 0.1589748523019112, | |
| "kl": 0.030548095703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.28962523490190506, | |
| "reward_std": 0.03408448817208409, | |
| "rewards/clip_reward": 0.28962523490190506, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008014425966740132, | |
| "grad_norm": 0.12748131467646742, | |
| "kl": 0.023712158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.2800466865301132, | |
| "reward_std": 0.03166115842759609, | |
| "rewards/clip_reward": 0.2800466865301132, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.000832267311930706, | |
| "grad_norm": 0.10841245478899014, | |
| "kl": 0.02532958984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.277116097509861, | |
| "reward_std": 0.038001535926014185, | |
| "rewards/clip_reward": 0.277116097509861, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008630920271873989, | |
| "grad_norm": 0.12709001525834415, | |
| "kl": 0.02606201171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.27604615688323975, | |
| "reward_std": 0.033548878505825996, | |
| "rewards/clip_reward": 0.27604615688323975, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0008939167424440917, | |
| "grad_norm": 0.21267185014697698, | |
| "kl": 0.036163330078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.28682367503643036, | |
| "reward_std": 0.04210791550576687, | |
| "rewards/clip_reward": 0.28682367503643036, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009247414577007845, | |
| "grad_norm": 0.18532382846975015, | |
| "kl": 0.033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2868439294397831, | |
| "reward_std": 0.03913262952119112, | |
| "rewards/clip_reward": 0.2868439294397831, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009555661729574773, | |
| "grad_norm": 0.12356243000047522, | |
| "kl": 0.03118896484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2863647863268852, | |
| "reward_std": 0.04316131863743067, | |
| "rewards/clip_reward": 0.2863647863268852, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0009863908882141701, | |
| "grad_norm": 0.110931968286718, | |
| "kl": 0.027008056640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2939675599336624, | |
| "reward_std": 0.039077806286513805, | |
| "rewards/clip_reward": 0.2939675599336624, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001017215603470863, | |
| "grad_norm": 0.2545347541932697, | |
| "kl": 0.02960205078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2887604981660843, | |
| "reward_std": 0.043353252578526735, | |
| "rewards/clip_reward": 0.2887604981660843, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0010480403187275557, | |
| "grad_norm": 0.12476611864849307, | |
| "kl": 0.035308837890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2868190184235573, | |
| "reward_std": 0.04044362064450979, | |
| "rewards/clip_reward": 0.2868190184235573, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0010788650339842486, | |
| "grad_norm": 0.12751972659201788, | |
| "kl": 0.031951904296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2924063131213188, | |
| "reward_std": 0.03874353598803282, | |
| "rewards/clip_reward": 0.2924063131213188, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0011096897492409413, | |
| "grad_norm": 0.11592072388218817, | |
| "kl": 0.033538818359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3099871575832367, | |
| "reward_std": 0.040243714582175016, | |
| "rewards/clip_reward": 0.3099871575832367, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0011405144644976342, | |
| "grad_norm": 0.11691018858992809, | |
| "kl": 0.028228759765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.29740212112665176, | |
| "reward_std": 0.03775101434439421, | |
| "rewards/clip_reward": 0.29740212112665176, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001171339179754327, | |
| "grad_norm": 0.12694146379271082, | |
| "kl": 0.029876708984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.29666946083307266, | |
| "reward_std": 0.03896902687847614, | |
| "rewards/clip_reward": 0.29666946083307266, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012021638950110197, | |
| "grad_norm": 0.11382088518050484, | |
| "kl": 0.034820556640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.25845085084438324, | |
| "reward_std": 0.035137762781232595, | |
| "rewards/clip_reward": 0.25845085084438324, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012329886102677126, | |
| "grad_norm": 0.11785591889735988, | |
| "kl": 0.03558349609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.3000144585967064, | |
| "reward_std": 0.041856614872813225, | |
| "rewards/clip_reward": 0.3000144585967064, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012638133255244055, | |
| "grad_norm": 0.14974893291010552, | |
| "kl": 0.03472900390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.292422890663147, | |
| "reward_std": 0.03956524468958378, | |
| "rewards/clip_reward": 0.292422890663147, | |
| "step": 41 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0012946380407810982, | |
| "grad_norm": 0.12625257582654123, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2874472737312317, | |
| "reward_std": 0.03826928976923227, | |
| "rewards/clip_reward": 0.2874472737312317, | |
| "step": 42 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001325462756037791, | |
| "grad_norm": 0.15391621735175193, | |
| "kl": 0.052886962890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.29409360885620117, | |
| "reward_std": 0.03215181827545166, | |
| "rewards/clip_reward": 0.29409360885620117, | |
| "step": 43 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001356287471294484, | |
| "grad_norm": 0.11856390130857818, | |
| "kl": 0.02789306640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.3074764534831047, | |
| "reward_std": 0.03519732179120183, | |
| "rewards/clip_reward": 0.3074764534831047, | |
| "step": 44 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0013871121865511767, | |
| "grad_norm": 0.11525897073099471, | |
| "kl": 0.02886962890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.290186382830143, | |
| "reward_std": 0.042675744742155075, | |
| "rewards/clip_reward": 0.290186382830143, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014179369018078695, | |
| "grad_norm": 0.11130066543273066, | |
| "kl": 0.031402587890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2904057502746582, | |
| "reward_std": 0.03623047983273864, | |
| "rewards/clip_reward": 0.2904057502746582, | |
| "step": 46 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014487616170645624, | |
| "grad_norm": 0.12239293095726622, | |
| "kl": 0.037353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.30481819808483124, | |
| "reward_std": 0.032313164323568344, | |
| "rewards/clip_reward": 0.30481819808483124, | |
| "step": 47 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0014795863323212551, | |
| "grad_norm": 0.12373536144300279, | |
| "kl": 0.03485107421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.29724979400634766, | |
| "reward_std": 0.03943999111652374, | |
| "rewards/clip_reward": 0.29724979400634766, | |
| "step": 48 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001510411047577948, | |
| "grad_norm": 0.1159028647129839, | |
| "kl": 0.03643798828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.29280055314302444, | |
| "reward_std": 0.037895018234848976, | |
| "rewards/clip_reward": 0.29280055314302444, | |
| "step": 49 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001541235762834641, | |
| "grad_norm": 0.11547994419061709, | |
| "kl": 0.027984619140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2938268706202507, | |
| "reward_std": 0.03788345959037542, | |
| "rewards/clip_reward": 0.2938268706202507, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0015720604780913336, | |
| "grad_norm": 1.6877147367273317, | |
| "kl": 0.1016845703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0041, | |
| "reward": 0.29082879424095154, | |
| "reward_std": 0.039864601101726294, | |
| "rewards/clip_reward": 0.29082879424095154, | |
| "step": 51 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0016028851933480265, | |
| "grad_norm": 0.21224329234905434, | |
| "kl": 0.060516357421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.29952527582645416, | |
| "reward_std": 0.029881142545491457, | |
| "rewards/clip_reward": 0.29952527582645416, | |
| "step": 52 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0016337099086047194, | |
| "grad_norm": 0.11290164286969186, | |
| "kl": 0.029541015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2998390942811966, | |
| "reward_std": 0.035071507561951876, | |
| "rewards/clip_reward": 0.2998390942811966, | |
| "step": 53 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001664534623861412, | |
| "grad_norm": 0.11400285072826415, | |
| "kl": 0.027587890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.29009225964546204, | |
| "reward_std": 0.03698861412703991, | |
| "rewards/clip_reward": 0.29009225964546204, | |
| "step": 54 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001695359339118105, | |
| "grad_norm": 0.13065371322254837, | |
| "kl": 0.028961181640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.3071891888976097, | |
| "reward_std": 0.029143241234123707, | |
| "rewards/clip_reward": 0.3071891888976097, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017261840543747978, | |
| "grad_norm": 0.1129906144070696, | |
| "kl": 0.02569580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.29338589310646057, | |
| "reward_std": 0.03269250225275755, | |
| "rewards/clip_reward": 0.29338589310646057, | |
| "step": 56 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017570087696314905, | |
| "grad_norm": 0.11727642814637977, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.31817278265953064, | |
| "reward_std": 0.03473840607330203, | |
| "rewards/clip_reward": 0.31817278265953064, | |
| "step": 57 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0017878334848881834, | |
| "grad_norm": 0.13164061182282957, | |
| "kl": 0.02825927734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2971828728914261, | |
| "reward_std": 0.03734842874109745, | |
| "rewards/clip_reward": 0.2971828728914261, | |
| "step": 58 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001818658200144876, | |
| "grad_norm": 0.11765242827689301, | |
| "kl": 0.029510498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.31315483897924423, | |
| "reward_std": 0.03106481023132801, | |
| "rewards/clip_reward": 0.31315483897924423, | |
| "step": 59 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.001849482915401569, | |
| "grad_norm": 0.12677244684117328, | |
| "kl": 0.023529052734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0009, | |
| "reward": 0.3002154156565666, | |
| "reward_std": 0.03606006037443876, | |
| "rewards/clip_reward": 0.3002154156565666, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0018803076306582618, | |
| "grad_norm": 0.136174367743151, | |
| "kl": 0.034423828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2926482856273651, | |
| "reward_std": 0.03695660084486008, | |
| "rewards/clip_reward": 0.2926482856273651, | |
| "step": 61 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019111323459149545, | |
| "grad_norm": 0.11353122868188088, | |
| "kl": 0.025970458984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.29593927413225174, | |
| "reward_std": 0.03822559863328934, | |
| "rewards/clip_reward": 0.29593927413225174, | |
| "step": 62 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019419570611716474, | |
| "grad_norm": 0.11947371382126508, | |
| "kl": 0.02886962890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.2990872785449028, | |
| "reward_std": 0.03435507323592901, | |
| "rewards/clip_reward": 0.2990872785449028, | |
| "step": 63 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0019727817764283403, | |
| "grad_norm": 0.10960459564919689, | |
| "kl": 0.027374267578125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2825084328651428, | |
| "reward_std": 0.037316225469112396, | |
| "rewards/clip_reward": 0.2825084328651428, | |
| "step": 64 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002003606491685033, | |
| "grad_norm": 0.10946023679973685, | |
| "kl": 0.024322509765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.3126995787024498, | |
| "reward_std": 0.035230320412665606, | |
| "rewards/clip_reward": 0.3126995787024498, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002034431206941726, | |
| "grad_norm": 0.10983182888889798, | |
| "kl": 0.0341796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2903076857328415, | |
| "reward_std": 0.03482948988676071, | |
| "rewards/clip_reward": 0.2903076857328415, | |
| "step": 66 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0020652559221984185, | |
| "grad_norm": 0.11287812931379468, | |
| "kl": 0.02862548828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.30673525482416153, | |
| "reward_std": 0.03648731391876936, | |
| "rewards/clip_reward": 0.30673525482416153, | |
| "step": 67 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0020960806374551114, | |
| "grad_norm": 0.1116123252766076, | |
| "kl": 0.028045654296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.3118920400738716, | |
| "reward_std": 0.04191158525645733, | |
| "rewards/clip_reward": 0.3118920400738716, | |
| "step": 68 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0021269053527118043, | |
| "grad_norm": 0.13046284746258094, | |
| "kl": 0.0286865234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.3048221841454506, | |
| "reward_std": 0.03315945668146014, | |
| "rewards/clip_reward": 0.3048221841454506, | |
| "step": 69 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002157730067968497, | |
| "grad_norm": 0.12197157089162045, | |
| "kl": 0.025115966796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.2929798662662506, | |
| "reward_std": 0.03310262132436037, | |
| "rewards/clip_reward": 0.2929798662662506, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00218855478322519, | |
| "grad_norm": 0.10833880759656236, | |
| "kl": 0.02850341796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2930976450443268, | |
| "reward_std": 0.03318624943494797, | |
| "rewards/clip_reward": 0.2930976450443268, | |
| "step": 71 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022193794984818826, | |
| "grad_norm": 0.11609773141793645, | |
| "kl": 0.02789306640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.2967787832021713, | |
| "reward_std": 0.0352731691673398, | |
| "rewards/clip_reward": 0.2967787832021713, | |
| "step": 72 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022502042137385755, | |
| "grad_norm": 0.10524474043777819, | |
| "kl": 0.02496337890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.001, | |
| "reward": 0.31397951394319534, | |
| "reward_std": 0.03302141930907965, | |
| "rewards/clip_reward": 0.31397951394319534, | |
| "step": 73 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0022810289289952683, | |
| "grad_norm": 0.11645830499407812, | |
| "kl": 0.03143310546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.304028183221817, | |
| "reward_std": 0.03270072164013982, | |
| "rewards/clip_reward": 0.304028183221817, | |
| "step": 74 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0023118536442519612, | |
| "grad_norm": 0.11708657920762414, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.29009127616882324, | |
| "reward_std": 0.034541524946689606, | |
| "rewards/clip_reward": 0.29009127616882324, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002342678359508654, | |
| "grad_norm": 0.1474963588837119, | |
| "kl": 0.029022216796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.3042903319001198, | |
| "reward_std": 0.03551435098052025, | |
| "rewards/clip_reward": 0.3042903319001198, | |
| "step": 76 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002373503074765347, | |
| "grad_norm": 0.11094369240221044, | |
| "kl": 0.03167724609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3129897713661194, | |
| "reward_std": 0.030104911886155605, | |
| "rewards/clip_reward": 0.3129897713661194, | |
| "step": 77 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024043277900220395, | |
| "grad_norm": 0.11276532754709433, | |
| "kl": 0.029083251953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.28792131692171097, | |
| "reward_std": 0.03244967618957162, | |
| "rewards/clip_reward": 0.28792131692171097, | |
| "step": 78 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024351525052787324, | |
| "grad_norm": 2.4407080756052175, | |
| "kl": 0.2354736328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0094, | |
| "reward": 0.3102322220802307, | |
| "reward_std": 0.0350488992407918, | |
| "rewards/clip_reward": 0.3102322220802307, | |
| "step": 79 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0024659772205354253, | |
| "grad_norm": 0.1159358540029852, | |
| "kl": 0.035400390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30823151022195816, | |
| "reward_std": 0.039351899176836014, | |
| "rewards/clip_reward": 0.30823151022195816, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002496801935792118, | |
| "grad_norm": 0.12888152498248232, | |
| "kl": 0.027679443359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.31155603379011154, | |
| "reward_std": 0.03785201674327254, | |
| "rewards/clip_reward": 0.31155603379011154, | |
| "step": 81 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002527626651048811, | |
| "grad_norm": 0.118057549023165, | |
| "kl": 0.031951904296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2901509776711464, | |
| "reward_std": 0.03677979623898864, | |
| "rewards/clip_reward": 0.2901509776711464, | |
| "step": 82 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002558451366305504, | |
| "grad_norm": 0.13671900392730388, | |
| "kl": 0.03436279296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.31552984565496445, | |
| "reward_std": 0.03665575571358204, | |
| "rewards/clip_reward": 0.31552984565496445, | |
| "step": 83 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0025892760815621964, | |
| "grad_norm": 0.1338150548332209, | |
| "kl": 0.02691650390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0011, | |
| "reward": 0.29462432861328125, | |
| "reward_std": 0.030553956981748343, | |
| "rewards/clip_reward": 0.29462432861328125, | |
| "step": 84 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0026201007968188893, | |
| "grad_norm": 0.11938476789667336, | |
| "kl": 0.035247802734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.29117922484874725, | |
| "reward_std": 0.034703842364251614, | |
| "rewards/clip_reward": 0.29117922484874725, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002650925512075582, | |
| "grad_norm": 0.15290800659433915, | |
| "kl": 0.02923583984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.30999132990837097, | |
| "reward_std": 0.03528518043458462, | |
| "rewards/clip_reward": 0.30999132990837097, | |
| "step": 86 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002681750227332275, | |
| "grad_norm": 0.1901908492516557, | |
| "kl": 0.035125732421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30646923929452896, | |
| "reward_std": 0.030330040026456118, | |
| "rewards/clip_reward": 0.30646923929452896, | |
| "step": 87 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002712574942588968, | |
| "grad_norm": 0.11770721369651402, | |
| "kl": 0.033416748046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.29444392770528793, | |
| "reward_std": 0.03116408735513687, | |
| "rewards/clip_reward": 0.29444392770528793, | |
| "step": 88 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002743399657845661, | |
| "grad_norm": 0.1826885288603463, | |
| "kl": 0.03265380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2949918806552887, | |
| "reward_std": 0.034480467438697815, | |
| "rewards/clip_reward": 0.2949918806552887, | |
| "step": 89 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0027742243731023533, | |
| "grad_norm": 0.12056299378953052, | |
| "kl": 0.03338623046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.31496553868055344, | |
| "reward_std": 0.03079960821196437, | |
| "rewards/clip_reward": 0.31496553868055344, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002805049088359046, | |
| "grad_norm": 0.12457250512365349, | |
| "kl": 0.03619384765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.28923606872558594, | |
| "reward_std": 0.03267038939520717, | |
| "rewards/clip_reward": 0.28923606872558594, | |
| "step": 91 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002835873803615739, | |
| "grad_norm": 1.5672006078532907, | |
| "kl": 0.07281494140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.3001748248934746, | |
| "reward_std": 0.03475807560607791, | |
| "rewards/clip_reward": 0.3001748248934746, | |
| "step": 92 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002866698518872432, | |
| "grad_norm": 0.14261331472528152, | |
| "kl": 0.032684326171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.323921799659729, | |
| "reward_std": 0.03504910413175821, | |
| "rewards/clip_reward": 0.323921799659729, | |
| "step": 93 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002897523234129125, | |
| "grad_norm": 0.11567854576839746, | |
| "kl": 0.0318603515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3107759431004524, | |
| "reward_std": 0.03745063720270991, | |
| "rewards/clip_reward": 0.3107759431004524, | |
| "step": 94 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0029283479493858173, | |
| "grad_norm": 0.11598622472023176, | |
| "kl": 0.030853271484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.3175903409719467, | |
| "reward_std": 0.03149988315999508, | |
| "rewards/clip_reward": 0.3175903409719467, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0029591726646425102, | |
| "grad_norm": 0.1254209118683483, | |
| "kl": 0.032806396484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.28729958087205887, | |
| "reward_std": 0.031386380549520254, | |
| "rewards/clip_reward": 0.28729958087205887, | |
| "step": 96 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.002989997379899203, | |
| "grad_norm": 0.12174331071691413, | |
| "kl": 0.0316162109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3118669241666794, | |
| "reward_std": 0.032158670015633106, | |
| "rewards/clip_reward": 0.3118669241666794, | |
| "step": 97 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003020822095155896, | |
| "grad_norm": 0.11712521290025418, | |
| "kl": 0.03448486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.31112753599882126, | |
| "reward_std": 0.043876828625798225, | |
| "rewards/clip_reward": 0.31112753599882126, | |
| "step": 98 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003051646810412589, | |
| "grad_norm": 0.11859882642032223, | |
| "kl": 0.03875732421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2820296436548233, | |
| "reward_std": 0.03406182769685984, | |
| "rewards/clip_reward": 0.2820296436548233, | |
| "step": 99 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003082471525669282, | |
| "grad_norm": 0.12885755957691103, | |
| "kl": 0.03619384765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30988558381795883, | |
| "reward_std": 0.03780778869986534, | |
| "rewards/clip_reward": 0.30988558381795883, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0031132962409259743, | |
| "grad_norm": 0.11306253389811041, | |
| "kl": 0.033660888671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3102700859308243, | |
| "reward_std": 0.03908272087574005, | |
| "rewards/clip_reward": 0.3102700859308243, | |
| "step": 101 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003144120956182667, | |
| "grad_norm": 0.11386105699536472, | |
| "kl": 0.0308837890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.29308290779590607, | |
| "reward_std": 0.03458858421072364, | |
| "rewards/clip_reward": 0.29308290779590607, | |
| "step": 102 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00317494567143936, | |
| "grad_norm": 0.10250552377032608, | |
| "kl": 0.035369873046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.31484246253967285, | |
| "reward_std": 0.03848233912140131, | |
| "rewards/clip_reward": 0.31484246253967285, | |
| "step": 103 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003205770386696053, | |
| "grad_norm": 0.11041408780399448, | |
| "kl": 0.03509521484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.30977974832057953, | |
| "reward_std": 0.0354487132281065, | |
| "rewards/clip_reward": 0.30977974832057953, | |
| "step": 104 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003236595101952746, | |
| "grad_norm": 0.11590179364539747, | |
| "kl": 0.0321044921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3202349618077278, | |
| "reward_std": 0.03078141063451767, | |
| "rewards/clip_reward": 0.3202349618077278, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0032674198172094387, | |
| "grad_norm": 0.14734135195006995, | |
| "kl": 0.035980224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.3130447119474411, | |
| "reward_std": 0.031586550641804934, | |
| "rewards/clip_reward": 0.3130447119474411, | |
| "step": 106 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003298244532466131, | |
| "grad_norm": 0.11436474499458421, | |
| "kl": 0.03759765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2865230664610863, | |
| "reward_std": 0.027899319771677256, | |
| "rewards/clip_reward": 0.2865230664610863, | |
| "step": 107 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003329069247722824, | |
| "grad_norm": 0.10968741552838084, | |
| "kl": 0.031463623046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2957867458462715, | |
| "reward_std": 0.0352176409214735, | |
| "rewards/clip_reward": 0.2957867458462715, | |
| "step": 108 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003359893962979517, | |
| "grad_norm": 0.12582085065454826, | |
| "kl": 0.030670166015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0012, | |
| "reward": 0.3016505390405655, | |
| "reward_std": 0.04240915086120367, | |
| "rewards/clip_reward": 0.3016505390405655, | |
| "step": 109 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00339071867823621, | |
| "grad_norm": 0.10773491440317534, | |
| "kl": 0.03265380859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.2948762997984886, | |
| "reward_std": 0.034737172070890665, | |
| "rewards/clip_reward": 0.2948762997984886, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0034215433934929027, | |
| "grad_norm": 0.10632490654255654, | |
| "kl": 0.03411865234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2969469800591469, | |
| "reward_std": 0.03294783923774958, | |
| "rewards/clip_reward": 0.2969469800591469, | |
| "step": 111 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0034523681087495956, | |
| "grad_norm": 0.10780628812986831, | |
| "kl": 0.03314208984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0013, | |
| "reward": 0.3059050217270851, | |
| "reward_std": 0.035596927627921104, | |
| "rewards/clip_reward": 0.3059050217270851, | |
| "step": 112 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003483192824006288, | |
| "grad_norm": 0.10992023386661232, | |
| "kl": 0.0360107421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.3140244632959366, | |
| "reward_std": 0.031090704258531332, | |
| "rewards/clip_reward": 0.3140244632959366, | |
| "step": 113 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003514017539262981, | |
| "grad_norm": 0.10972697905672188, | |
| "kl": 0.0379638671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.31535517424345016, | |
| "reward_std": 0.03570608049631119, | |
| "rewards/clip_reward": 0.31535517424345016, | |
| "step": 114 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003544842254519674, | |
| "grad_norm": 0.11267471386768459, | |
| "kl": 0.03448486328125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0014, | |
| "reward": 0.2879750058054924, | |
| "reward_std": 0.04104418680071831, | |
| "rewards/clip_reward": 0.2879750058054924, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0035756669697763668, | |
| "grad_norm": 0.11162685314945246, | |
| "kl": 0.038330078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.2966529279947281, | |
| "reward_std": 0.03232752811163664, | |
| "rewards/clip_reward": 0.2966529279947281, | |
| "step": 116 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0036064916850330597, | |
| "grad_norm": 0.1078402292948255, | |
| "kl": 0.03717041015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.3015007972717285, | |
| "reward_std": 0.03359420504420996, | |
| "rewards/clip_reward": 0.3015007972717285, | |
| "step": 117 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003637316400289752, | |
| "grad_norm": 0.11308735376651692, | |
| "kl": 0.03997802734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2964186370372772, | |
| "reward_std": 0.02699094917625189, | |
| "rewards/clip_reward": 0.2964186370372772, | |
| "step": 118 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003668141115546445, | |
| "grad_norm": 0.11055310636563517, | |
| "kl": 0.036590576171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.305886946618557, | |
| "reward_std": 0.03268259018659592, | |
| "rewards/clip_reward": 0.305886946618557, | |
| "step": 119 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003698965830803138, | |
| "grad_norm": 0.10880733102325107, | |
| "kl": 0.04144287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3160470202565193, | |
| "reward_std": 0.03552013309672475, | |
| "rewards/clip_reward": 0.3160470202565193, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003729790546059831, | |
| "grad_norm": 0.1084319678958757, | |
| "kl": 0.03759765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.30664851516485214, | |
| "reward_std": 0.03642770275473595, | |
| "rewards/clip_reward": 0.30664851516485214, | |
| "step": 121 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0037606152613165237, | |
| "grad_norm": 0.10716440463937717, | |
| "kl": 0.03912353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.3061741515994072, | |
| "reward_std": 0.03255116753280163, | |
| "rewards/clip_reward": 0.3061741515994072, | |
| "step": 122 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0037914399765732166, | |
| "grad_norm": 0.10921412355349838, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3017224445939064, | |
| "reward_std": 0.035058747977018356, | |
| "rewards/clip_reward": 0.3017224445939064, | |
| "step": 123 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003822264691829909, | |
| "grad_norm": 0.10891562876371483, | |
| "kl": 0.04351806640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3088080510497093, | |
| "reward_std": 0.03202015720307827, | |
| "rewards/clip_reward": 0.3088080510497093, | |
| "step": 124 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003853089407086602, | |
| "grad_norm": 0.12400681002324238, | |
| "kl": 0.0430908203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30519475787878036, | |
| "reward_std": 0.04081529099494219, | |
| "rewards/clip_reward": 0.30519475787878036, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003883914122343295, | |
| "grad_norm": 0.10684242248297056, | |
| "kl": 0.03955078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2999924272298813, | |
| "reward_std": 0.027399181853979826, | |
| "rewards/clip_reward": 0.2999924272298813, | |
| "step": 126 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003914738837599988, | |
| "grad_norm": 0.13383282223405826, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.29980389028787613, | |
| "reward_std": 0.02915497263893485, | |
| "rewards/clip_reward": 0.29980389028787613, | |
| "step": 127 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.003945563552856681, | |
| "grad_norm": 0.10680237923679747, | |
| "kl": 0.0423583984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3113924115896225, | |
| "reward_std": 0.034374223090708256, | |
| "rewards/clip_reward": 0.3113924115896225, | |
| "step": 128 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0039763882681133735, | |
| "grad_norm": 0.11633733033299526, | |
| "kl": 0.0419921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3037688434123993, | |
| "reward_std": 0.030008903238922358, | |
| "rewards/clip_reward": 0.3037688434123993, | |
| "step": 129 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004007212983370066, | |
| "grad_norm": 0.10824185375409869, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3037646785378456, | |
| "reward_std": 0.03128322120755911, | |
| "rewards/clip_reward": 0.3037646785378456, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004038037698626759, | |
| "grad_norm": 0.11498093646788146, | |
| "kl": 0.04241943359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3007878288626671, | |
| "reward_std": 0.035366450902074575, | |
| "rewards/clip_reward": 0.3007878288626671, | |
| "step": 131 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004068862413883452, | |
| "grad_norm": 0.10988923049203525, | |
| "kl": 0.04412841796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2908458858728409, | |
| "reward_std": 0.037285988219082355, | |
| "rewards/clip_reward": 0.2908458858728409, | |
| "step": 132 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004099687129140144, | |
| "grad_norm": 0.11966799658540882, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.31911730766296387, | |
| "reward_std": 0.03753689955919981, | |
| "rewards/clip_reward": 0.31911730766296387, | |
| "step": 133 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004130511844396837, | |
| "grad_norm": 0.11694661938666558, | |
| "kl": 0.0423583984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2962675616145134, | |
| "reward_std": 0.03178291115909815, | |
| "rewards/clip_reward": 0.2962675616145134, | |
| "step": 134 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00416133655965353, | |
| "grad_norm": 0.12087212424369766, | |
| "kl": 0.04498291015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2907417491078377, | |
| "reward_std": 0.039098432287573814, | |
| "rewards/clip_reward": 0.2907417491078377, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004192161274910223, | |
| "grad_norm": 0.1287243807650102, | |
| "kl": 0.04473876953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3177812397480011, | |
| "reward_std": 0.03574381256476045, | |
| "rewards/clip_reward": 0.3177812397480011, | |
| "step": 136 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004222985990166916, | |
| "grad_norm": 0.10737365706668978, | |
| "kl": 0.04510498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3081594184041023, | |
| "reward_std": 0.038351588882505894, | |
| "rewards/clip_reward": 0.3081594184041023, | |
| "step": 137 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004253810705423609, | |
| "grad_norm": 0.10905175589597935, | |
| "kl": 0.04962158203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3101942911744118, | |
| "reward_std": 0.036289566196501255, | |
| "rewards/clip_reward": 0.3101942911744118, | |
| "step": 138 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0042846354206803015, | |
| "grad_norm": 0.11276085217923483, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.30623695254325867, | |
| "reward_std": 0.03592977672815323, | |
| "rewards/clip_reward": 0.30623695254325867, | |
| "step": 139 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004315460135936994, | |
| "grad_norm": 0.11293325800639886, | |
| "kl": 0.04168701171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30889374017715454, | |
| "reward_std": 0.037142093293368816, | |
| "rewards/clip_reward": 0.30889374017715454, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004346284851193687, | |
| "grad_norm": 0.11777523910607922, | |
| "kl": 0.0472412109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3147361949086189, | |
| "reward_std": 0.03566309390589595, | |
| "rewards/clip_reward": 0.3147361949086189, | |
| "step": 141 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00437710956645038, | |
| "grad_norm": 0.11847683399423899, | |
| "kl": 0.04412841796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.30750318616628647, | |
| "reward_std": 0.031021112576127052, | |
| "rewards/clip_reward": 0.30750318616628647, | |
| "step": 142 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004407934281707073, | |
| "grad_norm": 0.2325766643168979, | |
| "kl": 0.04681396484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3154006227850914, | |
| "reward_std": 0.03396408865228295, | |
| "rewards/clip_reward": 0.3154006227850914, | |
| "step": 143 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004438758996963765, | |
| "grad_norm": 0.12205440467431385, | |
| "kl": 0.04742431640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3048614263534546, | |
| "reward_std": 0.03609074279665947, | |
| "rewards/clip_reward": 0.3048614263534546, | |
| "step": 144 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004469583712220458, | |
| "grad_norm": 0.23235604214068906, | |
| "kl": 0.04681396484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.311428502202034, | |
| "reward_std": 0.03360119927674532, | |
| "rewards/clip_reward": 0.311428502202034, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004500408427477151, | |
| "grad_norm": 0.10888274692722734, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.31328508257865906, | |
| "reward_std": 0.03548012813553214, | |
| "rewards/clip_reward": 0.31328508257865906, | |
| "step": 146 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004531233142733844, | |
| "grad_norm": 0.10880868321131348, | |
| "kl": 0.04425048828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3104088082909584, | |
| "reward_std": 0.03424055827781558, | |
| "rewards/clip_reward": 0.3104088082909584, | |
| "step": 147 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004562057857990537, | |
| "grad_norm": 0.13184897587638666, | |
| "kl": 0.03912353515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.32070276886224747, | |
| "reward_std": 0.035730645060539246, | |
| "rewards/clip_reward": 0.32070276886224747, | |
| "step": 148 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00459288257324723, | |
| "grad_norm": 0.13040013350171695, | |
| "kl": 0.036773681640625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.28690390288829803, | |
| "reward_std": 0.031756586860865355, | |
| "rewards/clip_reward": 0.28690390288829803, | |
| "step": 149 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0046237072885039225, | |
| "grad_norm": 0.11413869565578771, | |
| "kl": 0.0396728515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.29507312178611755, | |
| "reward_std": 0.03319581504911184, | |
| "rewards/clip_reward": 0.29507312178611755, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004654532003760615, | |
| "grad_norm": 0.11075253321749193, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3052366375923157, | |
| "reward_std": 0.03602536814287305, | |
| "rewards/clip_reward": 0.3052366375923157, | |
| "step": 151 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004685356719017308, | |
| "grad_norm": 0.10754829754075829, | |
| "kl": 0.0465087890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.32933981716632843, | |
| "reward_std": 0.03219308517873287, | |
| "rewards/clip_reward": 0.32933981716632843, | |
| "step": 152 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004716181434274001, | |
| "grad_norm": 0.11311221875180785, | |
| "kl": 0.03924560546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.32051652669906616, | |
| "reward_std": 0.03876081760972738, | |
| "rewards/clip_reward": 0.32051652669906616, | |
| "step": 153 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004747006149530694, | |
| "grad_norm": 0.10485333011345627, | |
| "kl": 0.04144287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3038826063275337, | |
| "reward_std": 0.03334263851866126, | |
| "rewards/clip_reward": 0.3038826063275337, | |
| "step": 154 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004777830864787387, | |
| "grad_norm": 0.2148098423660176, | |
| "kl": 0.05029296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3050876185297966, | |
| "reward_std": 0.03197276359423995, | |
| "rewards/clip_reward": 0.3050876185297966, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004808655580044079, | |
| "grad_norm": 0.11844316361896051, | |
| "kl": 0.03863525390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0015, | |
| "reward": 0.32097869366407394, | |
| "reward_std": 0.029324380215257406, | |
| "rewards/clip_reward": 0.32097869366407394, | |
| "step": 156 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004839480295300772, | |
| "grad_norm": 0.11792606634997942, | |
| "kl": 0.04254150390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3132959380745888, | |
| "reward_std": 0.027422321029007435, | |
| "rewards/clip_reward": 0.3132959380745888, | |
| "step": 157 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004870305010557465, | |
| "grad_norm": 0.11837395235594217, | |
| "kl": 0.0396728515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.3073809891939163, | |
| "reward_std": 0.03928511310368776, | |
| "rewards/clip_reward": 0.3073809891939163, | |
| "step": 158 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004901129725814158, | |
| "grad_norm": 0.11129310536332628, | |
| "kl": 0.0430908203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.2984026074409485, | |
| "reward_std": 0.03139211004599929, | |
| "rewards/clip_reward": 0.2984026074409485, | |
| "step": 159 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0049319544410708505, | |
| "grad_norm": 0.11532428624713585, | |
| "kl": 0.043212890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3008668124675751, | |
| "reward_std": 0.02627889020368457, | |
| "rewards/clip_reward": 0.3008668124675751, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004962779156327543, | |
| "grad_norm": 0.11786021221814753, | |
| "kl": 0.049560546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.28665469214320183, | |
| "reward_std": 0.03704976849257946, | |
| "rewards/clip_reward": 0.28665469214320183, | |
| "step": 161 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.004993603871584236, | |
| "grad_norm": 0.11010075273074264, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3088128864765167, | |
| "reward_std": 0.03618460427969694, | |
| "rewards/clip_reward": 0.3088128864765167, | |
| "step": 162 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005024428586840929, | |
| "grad_norm": 0.13239934947372414, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3036596402525902, | |
| "reward_std": 0.032796021085232496, | |
| "rewards/clip_reward": 0.3036596402525902, | |
| "step": 163 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005055253302097622, | |
| "grad_norm": 0.11413695831587789, | |
| "kl": 0.04144287109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.304002046585083, | |
| "reward_std": 0.029863339848816395, | |
| "rewards/clip_reward": 0.304002046585083, | |
| "step": 164 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005086078017354315, | |
| "grad_norm": 0.11564845823686422, | |
| "kl": 0.04547119140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.29191841185092926, | |
| "reward_std": 0.035626471508294344, | |
| "rewards/clip_reward": 0.29191841185092926, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005116902732611008, | |
| "grad_norm": 0.11329154232078235, | |
| "kl": 0.0458984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3206318989396095, | |
| "reward_std": 0.031096128281205893, | |
| "rewards/clip_reward": 0.3206318989396095, | |
| "step": 166 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0051477274478677, | |
| "grad_norm": 0.13085517741110167, | |
| "kl": 0.04510498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.28686685115098953, | |
| "reward_std": 0.028212732169777155, | |
| "rewards/clip_reward": 0.28686685115098953, | |
| "step": 167 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005178552163124393, | |
| "grad_norm": 0.35649465481543857, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3119603246450424, | |
| "reward_std": 0.03256976744160056, | |
| "rewards/clip_reward": 0.3119603246450424, | |
| "step": 168 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005209376878381086, | |
| "grad_norm": 0.13826424691903896, | |
| "kl": 0.047607421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.2963334918022156, | |
| "reward_std": 0.035677722189575434, | |
| "rewards/clip_reward": 0.2963334918022156, | |
| "step": 169 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005240201593637779, | |
| "grad_norm": 0.1534567095384766, | |
| "kl": 0.05511474609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3064122945070267, | |
| "reward_std": 0.030035972129553556, | |
| "rewards/clip_reward": 0.3064122945070267, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0052710263088944715, | |
| "grad_norm": 0.14535366087971044, | |
| "kl": 0.0533447265625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.31411340832710266, | |
| "reward_std": 0.03346576215699315, | |
| "rewards/clip_reward": 0.31411340832710266, | |
| "step": 171 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005301851024151164, | |
| "grad_norm": 0.11182081037914528, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.31372761726379395, | |
| "reward_std": 0.028209302574396133, | |
| "rewards/clip_reward": 0.31372761726379395, | |
| "step": 172 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005332675739407857, | |
| "grad_norm": 0.10998805611249249, | |
| "kl": 0.0506591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3076253980398178, | |
| "reward_std": 0.03530415939167142, | |
| "rewards/clip_reward": 0.3076253980398178, | |
| "step": 173 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00536350045466455, | |
| "grad_norm": 0.11782096135818707, | |
| "kl": 0.04974365234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.31246717274188995, | |
| "reward_std": 0.029364202171564102, | |
| "rewards/clip_reward": 0.31246717274188995, | |
| "step": 174 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005394325169921243, | |
| "grad_norm": 0.12510210623580764, | |
| "kl": 0.045166015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3149753659963608, | |
| "reward_std": 0.03486820124089718, | |
| "rewards/clip_reward": 0.3149753659963608, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005425149885177936, | |
| "grad_norm": 0.12073650226365598, | |
| "kl": 0.04754638671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3087178245186806, | |
| "reward_std": 0.03398646041750908, | |
| "rewards/clip_reward": 0.3087178245186806, | |
| "step": 176 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005455974600434629, | |
| "grad_norm": 0.16141440704371318, | |
| "kl": 0.046630859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.32408736646175385, | |
| "reward_std": 0.03735980670899153, | |
| "rewards/clip_reward": 0.32408736646175385, | |
| "step": 177 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005486799315691322, | |
| "grad_norm": 0.12441494367513423, | |
| "kl": 0.04547119140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.32129447907209396, | |
| "reward_std": 0.02921806275844574, | |
| "rewards/clip_reward": 0.32129447907209396, | |
| "step": 178 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005517624030948014, | |
| "grad_norm": 0.11472079048722336, | |
| "kl": 0.04193115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3252818211913109, | |
| "reward_std": 0.032147477846592665, | |
| "rewards/clip_reward": 0.3252818211913109, | |
| "step": 179 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005548448746204707, | |
| "grad_norm": 0.11643678981280302, | |
| "kl": 0.04559326171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.31385669857263565, | |
| "reward_std": 0.031697194557636976, | |
| "rewards/clip_reward": 0.31385669857263565, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0055792734614613995, | |
| "grad_norm": 0.10985906766240239, | |
| "kl": 0.0438232421875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3218560591340065, | |
| "reward_std": 0.030952177941799164, | |
| "rewards/clip_reward": 0.3218560591340065, | |
| "step": 181 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005610098176718092, | |
| "grad_norm": 0.12318124273748278, | |
| "kl": 0.044189453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3146945610642433, | |
| "reward_std": 0.037078809924423695, | |
| "rewards/clip_reward": 0.3146945610642433, | |
| "step": 182 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005640922891974785, | |
| "grad_norm": 0.10855110593795812, | |
| "kl": 0.0469970703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3247036188840866, | |
| "reward_std": 0.03742914833128452, | |
| "rewards/clip_reward": 0.3247036188840866, | |
| "step": 183 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005671747607231478, | |
| "grad_norm": 0.12463338581811485, | |
| "kl": 0.04730224609375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3007904663681984, | |
| "reward_std": 0.030957046430557966, | |
| "rewards/clip_reward": 0.3007904663681984, | |
| "step": 184 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005702572322488171, | |
| "grad_norm": 0.11738351599765742, | |
| "kl": 0.04608154296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3152424767613411, | |
| "reward_std": 0.032630473375320435, | |
| "rewards/clip_reward": 0.3152424767613411, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005733397037744864, | |
| "grad_norm": 0.12425233709466331, | |
| "kl": 0.0426025390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3105937987565994, | |
| "reward_std": 0.037436836399137974, | |
| "rewards/clip_reward": 0.3105937987565994, | |
| "step": 186 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005764221753001557, | |
| "grad_norm": 0.24540233073575124, | |
| "kl": 0.0604248046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.3193615674972534, | |
| "reward_std": 0.02923456858843565, | |
| "rewards/clip_reward": 0.3193615674972534, | |
| "step": 187 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00579504646825825, | |
| "grad_norm": 0.11122419357690229, | |
| "kl": 0.04248046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.29561520367860794, | |
| "reward_std": 0.03093513334169984, | |
| "rewards/clip_reward": 0.29561520367860794, | |
| "step": 188 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005825871183514943, | |
| "grad_norm": 0.11723471576199568, | |
| "kl": 0.04541015625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3073701113462448, | |
| "reward_std": 0.03550923429429531, | |
| "rewards/clip_reward": 0.3073701113462448, | |
| "step": 189 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005856695898771635, | |
| "grad_norm": 0.10846043413336356, | |
| "kl": 0.04443359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3202974200248718, | |
| "reward_std": 0.03128951042890549, | |
| "rewards/clip_reward": 0.3202974200248718, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005887520614028328, | |
| "grad_norm": 0.10684544727987075, | |
| "kl": 0.04400634765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3150642290711403, | |
| "reward_std": 0.038218459114432335, | |
| "rewards/clip_reward": 0.3150642290711403, | |
| "step": 191 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0059183453292850205, | |
| "grad_norm": 0.12179014361836414, | |
| "kl": 0.0406494140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.3023750111460686, | |
| "reward_std": 0.028901703655719757, | |
| "rewards/clip_reward": 0.3023750111460686, | |
| "step": 192 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005949170044541713, | |
| "grad_norm": 0.11287740721272002, | |
| "kl": 0.04296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.30537480115890503, | |
| "reward_std": 0.03340973751619458, | |
| "rewards/clip_reward": 0.30537480115890503, | |
| "step": 193 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.005979994759798406, | |
| "grad_norm": 0.11953948724468283, | |
| "kl": 0.0406494140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0016, | |
| "reward": 0.2977057322859764, | |
| "reward_std": 0.041334839537739754, | |
| "rewards/clip_reward": 0.2977057322859764, | |
| "step": 194 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006010819475055099, | |
| "grad_norm": 0.33669753751768144, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3174924701452255, | |
| "reward_std": 0.03484439663589001, | |
| "rewards/clip_reward": 0.3174924701452255, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006041644190311792, | |
| "grad_norm": 0.13081401668031936, | |
| "kl": 0.0428466796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.31883813440799713, | |
| "reward_std": 0.03002287307754159, | |
| "rewards/clip_reward": 0.31883813440799713, | |
| "step": 196 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006072468905568485, | |
| "grad_norm": 0.15362262357445108, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3148084282875061, | |
| "reward_std": 0.03563790209591389, | |
| "rewards/clip_reward": 0.3148084282875061, | |
| "step": 197 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006103293620825178, | |
| "grad_norm": 0.12036606667480174, | |
| "kl": 0.04522705078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3185431882739067, | |
| "reward_std": 0.03547387337312102, | |
| "rewards/clip_reward": 0.3185431882739067, | |
| "step": 198 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006134118336081871, | |
| "grad_norm": 0.11265487428982754, | |
| "kl": 0.04522705078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.3013475388288498, | |
| "reward_std": 0.027531601022928953, | |
| "rewards/clip_reward": 0.3013475388288498, | |
| "step": 199 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006164943051338564, | |
| "grad_norm": 0.12480397529853451, | |
| "kl": 0.04632568359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.2954695001244545, | |
| "reward_std": 0.035041794180870056, | |
| "rewards/clip_reward": 0.2954695001244545, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0061957677665952565, | |
| "grad_norm": 0.11326384693922859, | |
| "kl": 0.04779052734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.30719564110040665, | |
| "reward_std": 0.03018721006810665, | |
| "rewards/clip_reward": 0.30719564110040665, | |
| "step": 201 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0062265924818519485, | |
| "grad_norm": 0.14001014918782298, | |
| "kl": 0.0477294921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.27801472693681717, | |
| "reward_std": 0.029184456914663315, | |
| "rewards/clip_reward": 0.27801472693681717, | |
| "step": 202 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006257417197108641, | |
| "grad_norm": 0.13598377277553347, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.31848207861185074, | |
| "reward_std": 0.029095898382365704, | |
| "rewards/clip_reward": 0.31848207861185074, | |
| "step": 203 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006288241912365334, | |
| "grad_norm": 0.12990427030922783, | |
| "kl": 0.04669189453125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3233681470155716, | |
| "reward_std": 0.02884372603148222, | |
| "rewards/clip_reward": 0.3233681470155716, | |
| "step": 204 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006319066627622027, | |
| "grad_norm": 0.11337348510966652, | |
| "kl": 0.04705810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.30711568146944046, | |
| "reward_std": 0.027397962752729654, | |
| "rewards/clip_reward": 0.30711568146944046, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00634989134287872, | |
| "grad_norm": 0.11008239157225132, | |
| "kl": 0.04534912109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.31931688636541367, | |
| "reward_std": 0.029018502216786146, | |
| "rewards/clip_reward": 0.31931688636541367, | |
| "step": 206 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006380716058135413, | |
| "grad_norm": 0.11005906943350524, | |
| "kl": 0.04534912109375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.2889741063117981, | |
| "reward_std": 0.03417292470112443, | |
| "rewards/clip_reward": 0.2889741063117981, | |
| "step": 207 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006411540773392106, | |
| "grad_norm": 0.13107793374479376, | |
| "kl": 0.04388427734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0018, | |
| "reward": 0.316011942923069, | |
| "reward_std": 0.035842195618897676, | |
| "rewards/clip_reward": 0.316011942923069, | |
| "step": 208 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006442365488648799, | |
| "grad_norm": 0.11921653233574052, | |
| "kl": 0.04827880859375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3120528683066368, | |
| "reward_std": 0.03157835826277733, | |
| "rewards/clip_reward": 0.3120528683066368, | |
| "step": 209 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006473190203905492, | |
| "grad_norm": 0.11671423552831947, | |
| "kl": 0.04705810546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3096734285354614, | |
| "reward_std": 0.02532344777137041, | |
| "rewards/clip_reward": 0.3096734285354614, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0065040149191621845, | |
| "grad_norm": 0.12478193702072561, | |
| "kl": 0.04876708984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3255714699625969, | |
| "reward_std": 0.03476850828155875, | |
| "rewards/clip_reward": 0.3255714699625969, | |
| "step": 211 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006534839634418877, | |
| "grad_norm": 0.11234041456760836, | |
| "kl": 0.043701171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0017, | |
| "reward": 0.3205392137169838, | |
| "reward_std": 0.02879873849451542, | |
| "rewards/clip_reward": 0.3205392137169838, | |
| "step": 212 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0065656643496755695, | |
| "grad_norm": 0.11484489412906124, | |
| "kl": 0.04779052734375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.30573664605617523, | |
| "reward_std": 0.034792355727404356, | |
| "rewards/clip_reward": 0.30573664605617523, | |
| "step": 213 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006596489064932262, | |
| "grad_norm": 0.112973116545128, | |
| "kl": 0.0565185546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.322362020611763, | |
| "reward_std": 0.040699029341340065, | |
| "rewards/clip_reward": 0.322362020611763, | |
| "step": 214 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006627313780188955, | |
| "grad_norm": 0.11000967164413539, | |
| "kl": 0.0496826171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.32849258929491043, | |
| "reward_std": 0.03333634790033102, | |
| "rewards/clip_reward": 0.32849258929491043, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006658138495445648, | |
| "grad_norm": 0.11266446524081561, | |
| "kl": 0.04681396484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0019, | |
| "reward": 0.3268875405192375, | |
| "reward_std": 0.041359793394804, | |
| "rewards/clip_reward": 0.3268875405192375, | |
| "step": 216 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006688963210702341, | |
| "grad_norm": 0.1297650863707243, | |
| "kl": 0.05572509765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3042534068226814, | |
| "reward_std": 0.029003426898270845, | |
| "rewards/clip_reward": 0.3042534068226814, | |
| "step": 217 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006719787925959034, | |
| "grad_norm": 0.1224231057191533, | |
| "kl": 0.0494384765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.322785347700119, | |
| "reward_std": 0.03285923460498452, | |
| "rewards/clip_reward": 0.322785347700119, | |
| "step": 218 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006750612641215727, | |
| "grad_norm": 0.11197370609789116, | |
| "kl": 0.0545654296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3130335509777069, | |
| "reward_std": 0.028378690592944622, | |
| "rewards/clip_reward": 0.3130335509777069, | |
| "step": 219 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00678143735647242, | |
| "grad_norm": 0.12265292069342315, | |
| "kl": 0.0526123046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0021, | |
| "reward": 0.3101393133401871, | |
| "reward_std": 0.031088348012417555, | |
| "rewards/clip_reward": 0.3101393133401871, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006812262071729113, | |
| "grad_norm": 0.1301967649695606, | |
| "kl": 0.0504150390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.002, | |
| "reward": 0.3258736953139305, | |
| "reward_std": 0.030232697259634733, | |
| "rewards/clip_reward": 0.3258736953139305, | |
| "step": 221 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0068430867869858055, | |
| "grad_norm": 0.12267903014610237, | |
| "kl": 0.05633544921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3135734647512436, | |
| "reward_std": 0.029037311673164368, | |
| "rewards/clip_reward": 0.3135734647512436, | |
| "step": 222 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006873911502242498, | |
| "grad_norm": 0.1556499324095671, | |
| "kl": 0.05694580078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3139733672142029, | |
| "reward_std": 0.03242505481466651, | |
| "rewards/clip_reward": 0.3139733672142029, | |
| "step": 223 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006904736217499191, | |
| "grad_norm": 0.14040668849148338, | |
| "kl": 0.05712890625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.2902194894850254, | |
| "reward_std": 0.030722644180059433, | |
| "rewards/clip_reward": 0.2902194894850254, | |
| "step": 224 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006935560932755883, | |
| "grad_norm": 0.12469801158231446, | |
| "kl": 0.05535888671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.3171394020318985, | |
| "reward_std": 0.032871958799660206, | |
| "rewards/clip_reward": 0.3171394020318985, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006966385648012576, | |
| "grad_norm": 0.11664661161299794, | |
| "kl": 0.0555419921875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.30288901180028915, | |
| "reward_std": 0.03361931908875704, | |
| "rewards/clip_reward": 0.30288901180028915, | |
| "step": 226 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.006997210363269269, | |
| "grad_norm": 0.12368362093823822, | |
| "kl": 0.0579833984375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.29918094724416733, | |
| "reward_std": 0.032192114274948835, | |
| "rewards/clip_reward": 0.29918094724416733, | |
| "step": 227 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007028035078525962, | |
| "grad_norm": 0.13670193873160075, | |
| "kl": 0.0626220703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30772917717695236, | |
| "reward_std": 0.03168489225208759, | |
| "rewards/clip_reward": 0.30772917717695236, | |
| "step": 228 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007058859793782655, | |
| "grad_norm": 0.15557420264040966, | |
| "kl": 0.053955078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0022, | |
| "reward": 0.28914518654346466, | |
| "reward_std": 0.03747545275837183, | |
| "rewards/clip_reward": 0.28914518654346466, | |
| "step": 229 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007089684509039348, | |
| "grad_norm": 0.3182664775944573, | |
| "kl": 0.08154296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.311465322971344, | |
| "reward_std": 0.03380277007818222, | |
| "rewards/clip_reward": 0.311465322971344, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007120509224296041, | |
| "grad_norm": 0.12693032913934957, | |
| "kl": 0.0616455078125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.32272306829690933, | |
| "reward_std": 0.03391577862203121, | |
| "rewards/clip_reward": 0.32272306829690933, | |
| "step": 231 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0071513339395527335, | |
| "grad_norm": 4.296122241941977, | |
| "kl": 0.45172119140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0181, | |
| "reward": 0.32933176308870316, | |
| "reward_std": 0.03596270922571421, | |
| "rewards/clip_reward": 0.32933176308870316, | |
| "step": 232 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007182158654809426, | |
| "grad_norm": 0.12252452858451943, | |
| "kl": 0.06103515625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.30073945224285126, | |
| "reward_std": 0.035156805999577045, | |
| "rewards/clip_reward": 0.30073945224285126, | |
| "step": 233 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007212983370066119, | |
| "grad_norm": 0.18753510109293484, | |
| "kl": 0.0567626953125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.3166045621037483, | |
| "reward_std": 0.036252960562705994, | |
| "rewards/clip_reward": 0.3166045621037483, | |
| "step": 234 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007243808085322812, | |
| "grad_norm": 0.12244712083969155, | |
| "kl": 0.0599365234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0024, | |
| "reward": 0.32098332792520523, | |
| "reward_std": 0.02912920992821455, | |
| "rewards/clip_reward": 0.32098332792520523, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007274632800579504, | |
| "grad_norm": 0.1153924535202381, | |
| "kl": 0.057373046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0023, | |
| "reward": 0.30156801640987396, | |
| "reward_std": 0.03506749775260687, | |
| "rewards/clip_reward": 0.30156801640987396, | |
| "step": 236 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007305457515836197, | |
| "grad_norm": 982.1130719581894, | |
| "kl": 11.4212646484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.4579, | |
| "reward": 0.3112705275416374, | |
| "reward_std": 0.03293308801949024, | |
| "rewards/clip_reward": 0.3112705275416374, | |
| "step": 237 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00733628223109289, | |
| "grad_norm": 0.13912658954576582, | |
| "kl": 0.06134033203125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.31086497753858566, | |
| "reward_std": 0.03602492017671466, | |
| "rewards/clip_reward": 0.31086497753858566, | |
| "step": 238 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007367106946349583, | |
| "grad_norm": 0.15900496360182007, | |
| "kl": 0.072021484375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0029, | |
| "reward": 0.301338754594326, | |
| "reward_std": 0.03604850126430392, | |
| "rewards/clip_reward": 0.301338754594326, | |
| "step": 239 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007397931661606276, | |
| "grad_norm": 0.11682552010464033, | |
| "kl": 0.069091796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.310367226600647, | |
| "reward_std": 0.033799303229898214, | |
| "rewards/clip_reward": 0.310367226600647, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007428756376862969, | |
| "grad_norm": 0.6484753019850781, | |
| "kl": 0.1041259765625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0042, | |
| "reward": 0.32271357625722885, | |
| "reward_std": 0.03783900523558259, | |
| "rewards/clip_reward": 0.32271357625722885, | |
| "step": 241 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007459581092119662, | |
| "grad_norm": 0.12488153334727711, | |
| "kl": 0.06195068359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0025, | |
| "reward": 0.30252621322870255, | |
| "reward_std": 0.03500718716531992, | |
| "rewards/clip_reward": 0.30252621322870255, | |
| "step": 242 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.0074904058073763545, | |
| "grad_norm": 0.11793262109697311, | |
| "kl": 0.0692138671875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0028, | |
| "reward": 0.3169676512479782, | |
| "reward_std": 0.03632183885201812, | |
| "rewards/clip_reward": 0.3169676512479782, | |
| "step": 243 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007521230522633047, | |
| "grad_norm": 0.11916442015495367, | |
| "kl": 0.06756591796875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.32120058685541153, | |
| "reward_std": 0.03682229993864894, | |
| "rewards/clip_reward": 0.32120058685541153, | |
| "step": 244 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.00755205523788974, | |
| "grad_norm": 0.13178282787519513, | |
| "kl": 0.068115234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0027, | |
| "reward": 0.3041011393070221, | |
| "reward_std": 0.03216708730906248, | |
| "rewards/clip_reward": 0.3041011393070221, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007582879953146433, | |
| "grad_norm": 0.1209756865386705, | |
| "kl": 0.07470703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.003, | |
| "reward": 0.30233804881572723, | |
| "reward_std": 0.03544025868177414, | |
| "rewards/clip_reward": 0.30233804881572723, | |
| "step": 246 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007613704668403126, | |
| "grad_norm": 0.14259183948300264, | |
| "kl": 0.082275390625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0033, | |
| "reward": 0.297327883541584, | |
| "reward_std": 0.03194120712578297, | |
| "rewards/clip_reward": 0.297327883541584, | |
| "step": 247 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007644529383659818, | |
| "grad_norm": 0.11966570448950067, | |
| "kl": 0.07708740234375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.3017484247684479, | |
| "reward_std": 0.03380720689892769, | |
| "rewards/clip_reward": 0.3017484247684479, | |
| "step": 248 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007675354098916511, | |
| "grad_norm": 0.11584912623106094, | |
| "kl": 0.079345703125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0032, | |
| "reward": 0.2936403974890709, | |
| "reward_std": 0.03349851304665208, | |
| "rewards/clip_reward": 0.2936403974890709, | |
| "step": 249 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 4096.0, | |
| "epoch": 0.007706178814173204, | |
| "grad_norm": 0.13824008661423945, | |
| "kl": 0.0780029296875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0031, | |
| "reward": 0.3258681297302246, | |
| "reward_std": 0.04007330071181059, | |
| "rewards/clip_reward": 0.3258681297302246, | |
| "step": 250 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 32441, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 10, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 6, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |