SimpleAR-1.5B-RL / trainer_state.json
Daniel0724's picture
Upload folder using huggingface_hub
a1bc9bc verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.007706178814173204,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 3.082471525669282e-05,
"grad_norm": 0.13662848638776823,
"kl": 0.0,
"learning_rate": 1e-05,
"loss": 0.0,
"reward": 0.2896246537566185,
"reward_std": 0.043548169545829296,
"rewards/clip_reward": 0.2896246537566185,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 6.164943051338563e-05,
"grad_norm": 0.1178878537123743,
"kl": 0.0007257461547851562,
"learning_rate": 1e-05,
"loss": 0.0,
"reward": 0.28567346930503845,
"reward_std": 0.04437257535755634,
"rewards/clip_reward": 0.28567346930503845,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 9.247414577007844e-05,
"grad_norm": 0.3404139762424654,
"kl": 0.0029239654541015625,
"learning_rate": 1e-05,
"loss": 0.0001,
"reward": 0.26198844239115715,
"reward_std": 0.03637277893722057,
"rewards/clip_reward": 0.26198844239115715,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00012329886102677127,
"grad_norm": 0.21757091715643603,
"kl": 0.001544952392578125,
"learning_rate": 1e-05,
"loss": 0.0001,
"reward": 0.2846095412969589,
"reward_std": 0.03777279099449515,
"rewards/clip_reward": 0.2846095412969589,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00015412357628346408,
"grad_norm": 0.1348527934598492,
"kl": 0.002452850341796875,
"learning_rate": 1e-05,
"loss": 0.0001,
"reward": 0.23306814581155777,
"reward_std": 0.033804881386458874,
"rewards/clip_reward": 0.23306814581155777,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0001849482915401569,
"grad_norm": 0.14393009622892527,
"kl": 0.00441741943359375,
"learning_rate": 1e-05,
"loss": 0.0002,
"reward": 0.2847321555018425,
"reward_std": 0.040111628361046314,
"rewards/clip_reward": 0.2847321555018425,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00021577300679684973,
"grad_norm": 0.14351852552309294,
"kl": 0.00428009033203125,
"learning_rate": 1e-05,
"loss": 0.0002,
"reward": 0.2802872806787491,
"reward_std": 0.0383415911346674,
"rewards/clip_reward": 0.2802872806787491,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00024659772205354254,
"grad_norm": 0.16776667321961602,
"kl": 0.00494384765625,
"learning_rate": 1e-05,
"loss": 0.0002,
"reward": 0.2881240174174309,
"reward_std": 0.04321274207904935,
"rewards/clip_reward": 0.2881240174174309,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0002774224373102353,
"grad_norm": 0.13329464822711182,
"kl": 0.009368896484375,
"learning_rate": 1e-05,
"loss": 0.0004,
"reward": 0.2757921889424324,
"reward_std": 0.042105874978005886,
"rewards/clip_reward": 0.2757921889424324,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00030824715256692816,
"grad_norm": 0.12887604952918402,
"kl": 0.00988006591796875,
"learning_rate": 1e-05,
"loss": 0.0004,
"reward": 0.287712462246418,
"reward_std": 0.03645364008843899,
"rewards/clip_reward": 0.287712462246418,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.000339071867823621,
"grad_norm": 0.1231047237735355,
"kl": 0.0100250244140625,
"learning_rate": 1e-05,
"loss": 0.0004,
"reward": 0.24786356836557388,
"reward_std": 0.036720491014420986,
"rewards/clip_reward": 0.24786356836557388,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0003698965830803138,
"grad_norm": 0.3414993544210762,
"kl": 0.01570892333984375,
"learning_rate": 1e-05,
"loss": 0.0006,
"reward": 0.31398245692253113,
"reward_std": 0.05272817797958851,
"rewards/clip_reward": 0.31398245692253113,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0004007212983370066,
"grad_norm": 0.13599349338370845,
"kl": 0.014190673828125,
"learning_rate": 1e-05,
"loss": 0.0006,
"reward": 0.28073475882411003,
"reward_std": 0.04901007656008005,
"rewards/clip_reward": 0.28073475882411003,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00043154601359369945,
"grad_norm": 0.127187147587617,
"kl": 0.0131378173828125,
"learning_rate": 1e-05,
"loss": 0.0005,
"reward": 0.280636228621006,
"reward_std": 0.039431299082934856,
"rewards/clip_reward": 0.280636228621006,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00046237072885039224,
"grad_norm": 0.13215878836878708,
"kl": 0.0188140869140625,
"learning_rate": 1e-05,
"loss": 0.0008,
"reward": 0.29201044142246246,
"reward_std": 0.035117349587380886,
"rewards/clip_reward": 0.29201044142246246,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0004931954441070851,
"grad_norm": 0.12256785878118313,
"kl": 0.022613525390625,
"learning_rate": 1e-05,
"loss": 0.0009,
"reward": 0.2798103988170624,
"reward_std": 0.03762377658858895,
"rewards/clip_reward": 0.2798103988170624,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0005240201593637779,
"grad_norm": 0.19802700549242463,
"kl": 0.02978515625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.27924566715955734,
"reward_std": 0.04653105605393648,
"rewards/clip_reward": 0.27924566715955734,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0005548448746204706,
"grad_norm": 0.16347182492777684,
"kl": 0.020050048828125,
"learning_rate": 1e-05,
"loss": 0.0008,
"reward": 0.2851836755871773,
"reward_std": 0.034420196898281574,
"rewards/clip_reward": 0.2851836755871773,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0005856695898771635,
"grad_norm": 0.1227504829797276,
"kl": 0.02288818359375,
"learning_rate": 1e-05,
"loss": 0.0009,
"reward": 0.3039173483848572,
"reward_std": 0.04395513795316219,
"rewards/clip_reward": 0.3039173483848572,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0006164943051338563,
"grad_norm": 0.11185292631745263,
"kl": 0.01983642578125,
"learning_rate": 1e-05,
"loss": 0.0008,
"reward": 0.2992554157972336,
"reward_std": 0.043223864398896694,
"rewards/clip_reward": 0.2992554157972336,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0006473190203905491,
"grad_norm": 0.11079321237999702,
"kl": 0.0237884521484375,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.2894679084420204,
"reward_std": 0.03523569507524371,
"rewards/clip_reward": 0.2894679084420204,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.000678143735647242,
"grad_norm": 0.12194111403192436,
"kl": 0.02587890625,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.2820267304778099,
"reward_std": 0.040216268971562386,
"rewards/clip_reward": 0.2820267304778099,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0007089684509039348,
"grad_norm": 0.14764947004412698,
"kl": 0.030364990234375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.30307962000370026,
"reward_std": 0.0421298248693347,
"rewards/clip_reward": 0.30307962000370026,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0007397931661606276,
"grad_norm": 0.13740957875900825,
"kl": 0.029144287109375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.30139467120170593,
"reward_std": 0.03585191536694765,
"rewards/clip_reward": 0.30139467120170593,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0007706178814173204,
"grad_norm": 0.1589748523019112,
"kl": 0.030548095703125,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.28962523490190506,
"reward_std": 0.03408448817208409,
"rewards/clip_reward": 0.28962523490190506,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0008014425966740132,
"grad_norm": 0.12748131467646742,
"kl": 0.023712158203125,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.2800466865301132,
"reward_std": 0.03166115842759609,
"rewards/clip_reward": 0.2800466865301132,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.000832267311930706,
"grad_norm": 0.10841245478899014,
"kl": 0.02532958984375,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.277116097509861,
"reward_std": 0.038001535926014185,
"rewards/clip_reward": 0.277116097509861,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0008630920271873989,
"grad_norm": 0.12709001525834415,
"kl": 0.02606201171875,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.27604615688323975,
"reward_std": 0.033548878505825996,
"rewards/clip_reward": 0.27604615688323975,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0008939167424440917,
"grad_norm": 0.21267185014697698,
"kl": 0.036163330078125,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.28682367503643036,
"reward_std": 0.04210791550576687,
"rewards/clip_reward": 0.28682367503643036,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0009247414577007845,
"grad_norm": 0.18532382846975015,
"kl": 0.033203125,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2868439294397831,
"reward_std": 0.03913262952119112,
"rewards/clip_reward": 0.2868439294397831,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0009555661729574773,
"grad_norm": 0.12356243000047522,
"kl": 0.03118896484375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.2863647863268852,
"reward_std": 0.04316131863743067,
"rewards/clip_reward": 0.2863647863268852,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0009863908882141701,
"grad_norm": 0.110931968286718,
"kl": 0.027008056640625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2939675599336624,
"reward_std": 0.039077806286513805,
"rewards/clip_reward": 0.2939675599336624,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001017215603470863,
"grad_norm": 0.2545347541932697,
"kl": 0.02960205078125,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.2887604981660843,
"reward_std": 0.043353252578526735,
"rewards/clip_reward": 0.2887604981660843,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0010480403187275557,
"grad_norm": 0.12476611864849307,
"kl": 0.035308837890625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.2868190184235573,
"reward_std": 0.04044362064450979,
"rewards/clip_reward": 0.2868190184235573,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0010788650339842486,
"grad_norm": 0.12751972659201788,
"kl": 0.031951904296875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2924063131213188,
"reward_std": 0.03874353598803282,
"rewards/clip_reward": 0.2924063131213188,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0011096897492409413,
"grad_norm": 0.11592072388218817,
"kl": 0.033538818359375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3099871575832367,
"reward_std": 0.040243714582175016,
"rewards/clip_reward": 0.3099871575832367,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0011405144644976342,
"grad_norm": 0.11691018858992809,
"kl": 0.028228759765625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.29740212112665176,
"reward_std": 0.03775101434439421,
"rewards/clip_reward": 0.29740212112665176,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001171339179754327,
"grad_norm": 0.12694146379271082,
"kl": 0.029876708984375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.29666946083307266,
"reward_std": 0.03896902687847614,
"rewards/clip_reward": 0.29666946083307266,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0012021638950110197,
"grad_norm": 0.11382088518050484,
"kl": 0.034820556640625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.25845085084438324,
"reward_std": 0.035137762781232595,
"rewards/clip_reward": 0.25845085084438324,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0012329886102677126,
"grad_norm": 0.11785591889735988,
"kl": 0.03558349609375,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.3000144585967064,
"reward_std": 0.041856614872813225,
"rewards/clip_reward": 0.3000144585967064,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0012638133255244055,
"grad_norm": 0.14974893291010552,
"kl": 0.03472900390625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.292422890663147,
"reward_std": 0.03956524468958378,
"rewards/clip_reward": 0.292422890663147,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0012946380407810982,
"grad_norm": 0.12625257582654123,
"kl": 0.043701171875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.2874472737312317,
"reward_std": 0.03826928976923227,
"rewards/clip_reward": 0.2874472737312317,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001325462756037791,
"grad_norm": 0.15391621735175193,
"kl": 0.052886962890625,
"learning_rate": 1e-05,
"loss": 0.0021,
"reward": 0.29409360885620117,
"reward_std": 0.03215181827545166,
"rewards/clip_reward": 0.29409360885620117,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001356287471294484,
"grad_norm": 0.11856390130857818,
"kl": 0.02789306640625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.3074764534831047,
"reward_std": 0.03519732179120183,
"rewards/clip_reward": 0.3074764534831047,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0013871121865511767,
"grad_norm": 0.11525897073099471,
"kl": 0.02886962890625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.290186382830143,
"reward_std": 0.042675744742155075,
"rewards/clip_reward": 0.290186382830143,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0014179369018078695,
"grad_norm": 0.11130066543273066,
"kl": 0.031402587890625,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2904057502746582,
"reward_std": 0.03623047983273864,
"rewards/clip_reward": 0.2904057502746582,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0014487616170645624,
"grad_norm": 0.12239293095726622,
"kl": 0.037353515625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.30481819808483124,
"reward_std": 0.032313164323568344,
"rewards/clip_reward": 0.30481819808483124,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0014795863323212551,
"grad_norm": 0.12373536144300279,
"kl": 0.03485107421875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.29724979400634766,
"reward_std": 0.03943999111652374,
"rewards/clip_reward": 0.29724979400634766,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001510411047577948,
"grad_norm": 0.1159028647129839,
"kl": 0.03643798828125,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.29280055314302444,
"reward_std": 0.037895018234848976,
"rewards/clip_reward": 0.29280055314302444,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001541235762834641,
"grad_norm": 0.11547994419061709,
"kl": 0.027984619140625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2938268706202507,
"reward_std": 0.03788345959037542,
"rewards/clip_reward": 0.2938268706202507,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0015720604780913336,
"grad_norm": 1.6877147367273317,
"kl": 0.1016845703125,
"learning_rate": 1e-05,
"loss": 0.0041,
"reward": 0.29082879424095154,
"reward_std": 0.039864601101726294,
"rewards/clip_reward": 0.29082879424095154,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0016028851933480265,
"grad_norm": 0.21224329234905434,
"kl": 0.060516357421875,
"learning_rate": 1e-05,
"loss": 0.0024,
"reward": 0.29952527582645416,
"reward_std": 0.029881142545491457,
"rewards/clip_reward": 0.29952527582645416,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0016337099086047194,
"grad_norm": 0.11290164286969186,
"kl": 0.029541015625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.2998390942811966,
"reward_std": 0.035071507561951876,
"rewards/clip_reward": 0.2998390942811966,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001664534623861412,
"grad_norm": 0.11400285072826415,
"kl": 0.027587890625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.29009225964546204,
"reward_std": 0.03698861412703991,
"rewards/clip_reward": 0.29009225964546204,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001695359339118105,
"grad_norm": 0.13065371322254837,
"kl": 0.028961181640625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.3071891888976097,
"reward_std": 0.029143241234123707,
"rewards/clip_reward": 0.3071891888976097,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0017261840543747978,
"grad_norm": 0.1129906144070696,
"kl": 0.02569580078125,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.29338589310646057,
"reward_std": 0.03269250225275755,
"rewards/clip_reward": 0.29338589310646057,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0017570087696314905,
"grad_norm": 0.11727642814637977,
"kl": 0.0286865234375,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.31817278265953064,
"reward_std": 0.03473840607330203,
"rewards/clip_reward": 0.31817278265953064,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0017878334848881834,
"grad_norm": 0.13164061182282957,
"kl": 0.02825927734375,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2971828728914261,
"reward_std": 0.03734842874109745,
"rewards/clip_reward": 0.2971828728914261,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001818658200144876,
"grad_norm": 0.11765242827689301,
"kl": 0.029510498046875,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.31315483897924423,
"reward_std": 0.03106481023132801,
"rewards/clip_reward": 0.31315483897924423,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.001849482915401569,
"grad_norm": 0.12677244684117328,
"kl": 0.023529052734375,
"learning_rate": 1e-05,
"loss": 0.0009,
"reward": 0.3002154156565666,
"reward_std": 0.03606006037443876,
"rewards/clip_reward": 0.3002154156565666,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0018803076306582618,
"grad_norm": 0.136174367743151,
"kl": 0.034423828125,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.2926482856273651,
"reward_std": 0.03695660084486008,
"rewards/clip_reward": 0.2926482856273651,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0019111323459149545,
"grad_norm": 0.11353122868188088,
"kl": 0.025970458984375,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.29593927413225174,
"reward_std": 0.03822559863328934,
"rewards/clip_reward": 0.29593927413225174,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0019419570611716474,
"grad_norm": 0.11947371382126508,
"kl": 0.02886962890625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.2990872785449028,
"reward_std": 0.03435507323592901,
"rewards/clip_reward": 0.2990872785449028,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0019727817764283403,
"grad_norm": 0.10960459564919689,
"kl": 0.027374267578125,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2825084328651428,
"reward_std": 0.037316225469112396,
"rewards/clip_reward": 0.2825084328651428,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002003606491685033,
"grad_norm": 0.10946023679973685,
"kl": 0.024322509765625,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.3126995787024498,
"reward_std": 0.035230320412665606,
"rewards/clip_reward": 0.3126995787024498,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002034431206941726,
"grad_norm": 0.10983182888889798,
"kl": 0.0341796875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.2903076857328415,
"reward_std": 0.03482948988676071,
"rewards/clip_reward": 0.2903076857328415,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0020652559221984185,
"grad_norm": 0.11287812931379468,
"kl": 0.02862548828125,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.30673525482416153,
"reward_std": 0.03648731391876936,
"rewards/clip_reward": 0.30673525482416153,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0020960806374551114,
"grad_norm": 0.1116123252766076,
"kl": 0.028045654296875,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.3118920400738716,
"reward_std": 0.04191158525645733,
"rewards/clip_reward": 0.3118920400738716,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0021269053527118043,
"grad_norm": 0.13046284746258094,
"kl": 0.0286865234375,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.3048221841454506,
"reward_std": 0.03315945668146014,
"rewards/clip_reward": 0.3048221841454506,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002157730067968497,
"grad_norm": 0.12197157089162045,
"kl": 0.025115966796875,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.2929798662662506,
"reward_std": 0.03310262132436037,
"rewards/clip_reward": 0.2929798662662506,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00218855478322519,
"grad_norm": 0.10833880759656236,
"kl": 0.02850341796875,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2930976450443268,
"reward_std": 0.03318624943494797,
"rewards/clip_reward": 0.2930976450443268,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0022193794984818826,
"grad_norm": 0.11609773141793645,
"kl": 0.02789306640625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.2967787832021713,
"reward_std": 0.0352731691673398,
"rewards/clip_reward": 0.2967787832021713,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0022502042137385755,
"grad_norm": 0.10524474043777819,
"kl": 0.02496337890625,
"learning_rate": 1e-05,
"loss": 0.001,
"reward": 0.31397951394319534,
"reward_std": 0.03302141930907965,
"rewards/clip_reward": 0.31397951394319534,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0022810289289952683,
"grad_norm": 0.11645830499407812,
"kl": 0.03143310546875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.304028183221817,
"reward_std": 0.03270072164013982,
"rewards/clip_reward": 0.304028183221817,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0023118536442519612,
"grad_norm": 0.11708657920762414,
"kl": 0.0277099609375,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.29009127616882324,
"reward_std": 0.034541524946689606,
"rewards/clip_reward": 0.29009127616882324,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002342678359508654,
"grad_norm": 0.1474963588837119,
"kl": 0.029022216796875,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.3042903319001198,
"reward_std": 0.03551435098052025,
"rewards/clip_reward": 0.3042903319001198,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002373503074765347,
"grad_norm": 0.11094369240221044,
"kl": 0.03167724609375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3129897713661194,
"reward_std": 0.030104911886155605,
"rewards/clip_reward": 0.3129897713661194,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0024043277900220395,
"grad_norm": 0.11276532754709433,
"kl": 0.029083251953125,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.28792131692171097,
"reward_std": 0.03244967618957162,
"rewards/clip_reward": 0.28792131692171097,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0024351525052787324,
"grad_norm": 2.4407080756052175,
"kl": 0.2354736328125,
"learning_rate": 1e-05,
"loss": 0.0094,
"reward": 0.3102322220802307,
"reward_std": 0.0350488992407918,
"rewards/clip_reward": 0.3102322220802307,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0024659772205354253,
"grad_norm": 0.1159358540029852,
"kl": 0.035400390625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.30823151022195816,
"reward_std": 0.039351899176836014,
"rewards/clip_reward": 0.30823151022195816,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002496801935792118,
"grad_norm": 0.12888152498248232,
"kl": 0.027679443359375,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.31155603379011154,
"reward_std": 0.03785201674327254,
"rewards/clip_reward": 0.31155603379011154,
"step": 81
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002527626651048811,
"grad_norm": 0.118057549023165,
"kl": 0.031951904296875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2901509776711464,
"reward_std": 0.03677979623898864,
"rewards/clip_reward": 0.2901509776711464,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002558451366305504,
"grad_norm": 0.13671900392730388,
"kl": 0.03436279296875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.31552984565496445,
"reward_std": 0.03665575571358204,
"rewards/clip_reward": 0.31552984565496445,
"step": 83
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0025892760815621964,
"grad_norm": 0.1338150548332209,
"kl": 0.02691650390625,
"learning_rate": 1e-05,
"loss": 0.0011,
"reward": 0.29462432861328125,
"reward_std": 0.030553956981748343,
"rewards/clip_reward": 0.29462432861328125,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0026201007968188893,
"grad_norm": 0.11938476789667336,
"kl": 0.035247802734375,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.29117922484874725,
"reward_std": 0.034703842364251614,
"rewards/clip_reward": 0.29117922484874725,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002650925512075582,
"grad_norm": 0.15290800659433915,
"kl": 0.02923583984375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.30999132990837097,
"reward_std": 0.03528518043458462,
"rewards/clip_reward": 0.30999132990837097,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002681750227332275,
"grad_norm": 0.1901908492516557,
"kl": 0.035125732421875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.30646923929452896,
"reward_std": 0.030330040026456118,
"rewards/clip_reward": 0.30646923929452896,
"step": 87
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002712574942588968,
"grad_norm": 0.11770721369651402,
"kl": 0.033416748046875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.29444392770528793,
"reward_std": 0.03116408735513687,
"rewards/clip_reward": 0.29444392770528793,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002743399657845661,
"grad_norm": 0.1826885288603463,
"kl": 0.03265380859375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2949918806552887,
"reward_std": 0.034480467438697815,
"rewards/clip_reward": 0.2949918806552887,
"step": 89
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0027742243731023533,
"grad_norm": 0.12056299378953052,
"kl": 0.03338623046875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.31496553868055344,
"reward_std": 0.03079960821196437,
"rewards/clip_reward": 0.31496553868055344,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002805049088359046,
"grad_norm": 0.12457250512365349,
"kl": 0.03619384765625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.28923606872558594,
"reward_std": 0.03267038939520717,
"rewards/clip_reward": 0.28923606872558594,
"step": 91
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002835873803615739,
"grad_norm": 1.5672006078532907,
"kl": 0.07281494140625,
"learning_rate": 1e-05,
"loss": 0.0029,
"reward": 0.3001748248934746,
"reward_std": 0.03475807560607791,
"rewards/clip_reward": 0.3001748248934746,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002866698518872432,
"grad_norm": 0.14261331472528152,
"kl": 0.032684326171875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.323921799659729,
"reward_std": 0.03504910413175821,
"rewards/clip_reward": 0.323921799659729,
"step": 93
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002897523234129125,
"grad_norm": 0.11567854576839746,
"kl": 0.0318603515625,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3107759431004524,
"reward_std": 0.03745063720270991,
"rewards/clip_reward": 0.3107759431004524,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0029283479493858173,
"grad_norm": 0.11598622472023176,
"kl": 0.030853271484375,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.3175903409719467,
"reward_std": 0.03149988315999508,
"rewards/clip_reward": 0.3175903409719467,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0029591726646425102,
"grad_norm": 0.1254209118683483,
"kl": 0.032806396484375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.28729958087205887,
"reward_std": 0.031386380549520254,
"rewards/clip_reward": 0.28729958087205887,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.002989997379899203,
"grad_norm": 0.12174331071691413,
"kl": 0.0316162109375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3118669241666794,
"reward_std": 0.032158670015633106,
"rewards/clip_reward": 0.3118669241666794,
"step": 97
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003020822095155896,
"grad_norm": 0.11712521290025418,
"kl": 0.03448486328125,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.31112753599882126,
"reward_std": 0.043876828625798225,
"rewards/clip_reward": 0.31112753599882126,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003051646810412589,
"grad_norm": 0.11859882642032223,
"kl": 0.03875732421875,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.2820296436548233,
"reward_std": 0.03406182769685984,
"rewards/clip_reward": 0.2820296436548233,
"step": 99
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003082471525669282,
"grad_norm": 0.12885755957691103,
"kl": 0.03619384765625,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.30988558381795883,
"reward_std": 0.03780778869986534,
"rewards/clip_reward": 0.30988558381795883,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0031132962409259743,
"grad_norm": 0.11306253389811041,
"kl": 0.033660888671875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3102700859308243,
"reward_std": 0.03908272087574005,
"rewards/clip_reward": 0.3102700859308243,
"step": 101
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003144120956182667,
"grad_norm": 0.11386105699536472,
"kl": 0.0308837890625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.29308290779590607,
"reward_std": 0.03458858421072364,
"rewards/clip_reward": 0.29308290779590607,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00317494567143936,
"grad_norm": 0.10250552377032608,
"kl": 0.035369873046875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.31484246253967285,
"reward_std": 0.03848233912140131,
"rewards/clip_reward": 0.31484246253967285,
"step": 103
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003205770386696053,
"grad_norm": 0.11041408780399448,
"kl": 0.03509521484375,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.30977974832057953,
"reward_std": 0.0354487132281065,
"rewards/clip_reward": 0.30977974832057953,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003236595101952746,
"grad_norm": 0.11590179364539747,
"kl": 0.0321044921875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3202349618077278,
"reward_std": 0.03078141063451767,
"rewards/clip_reward": 0.3202349618077278,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0032674198172094387,
"grad_norm": 0.14734135195006995,
"kl": 0.035980224609375,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.3130447119474411,
"reward_std": 0.031586550641804934,
"rewards/clip_reward": 0.3130447119474411,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003298244532466131,
"grad_norm": 0.11436474499458421,
"kl": 0.03759765625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.2865230664610863,
"reward_std": 0.027899319771677256,
"rewards/clip_reward": 0.2865230664610863,
"step": 107
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003329069247722824,
"grad_norm": 0.10968741552838084,
"kl": 0.031463623046875,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2957867458462715,
"reward_std": 0.0352176409214735,
"rewards/clip_reward": 0.2957867458462715,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003359893962979517,
"grad_norm": 0.12582085065454826,
"kl": 0.030670166015625,
"learning_rate": 1e-05,
"loss": 0.0012,
"reward": 0.3016505390405655,
"reward_std": 0.04240915086120367,
"rewards/clip_reward": 0.3016505390405655,
"step": 109
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00339071867823621,
"grad_norm": 0.10773491440317534,
"kl": 0.03265380859375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.2948762997984886,
"reward_std": 0.034737172070890665,
"rewards/clip_reward": 0.2948762997984886,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0034215433934929027,
"grad_norm": 0.10632490654255654,
"kl": 0.03411865234375,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.2969469800591469,
"reward_std": 0.03294783923774958,
"rewards/clip_reward": 0.2969469800591469,
"step": 111
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0034523681087495956,
"grad_norm": 0.10780628812986831,
"kl": 0.03314208984375,
"learning_rate": 1e-05,
"loss": 0.0013,
"reward": 0.3059050217270851,
"reward_std": 0.035596927627921104,
"rewards/clip_reward": 0.3059050217270851,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003483192824006288,
"grad_norm": 0.10992023386661232,
"kl": 0.0360107421875,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.3140244632959366,
"reward_std": 0.031090704258531332,
"rewards/clip_reward": 0.3140244632959366,
"step": 113
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003514017539262981,
"grad_norm": 0.10972697905672188,
"kl": 0.0379638671875,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.31535517424345016,
"reward_std": 0.03570608049631119,
"rewards/clip_reward": 0.31535517424345016,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003544842254519674,
"grad_norm": 0.11267471386768459,
"kl": 0.03448486328125,
"learning_rate": 1e-05,
"loss": 0.0014,
"reward": 0.2879750058054924,
"reward_std": 0.04104418680071831,
"rewards/clip_reward": 0.2879750058054924,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0035756669697763668,
"grad_norm": 0.11162685314945246,
"kl": 0.038330078125,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.2966529279947281,
"reward_std": 0.03232752811163664,
"rewards/clip_reward": 0.2966529279947281,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0036064916850330597,
"grad_norm": 0.1078402292948255,
"kl": 0.03717041015625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.3015007972717285,
"reward_std": 0.03359420504420996,
"rewards/clip_reward": 0.3015007972717285,
"step": 117
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003637316400289752,
"grad_norm": 0.11308735376651692,
"kl": 0.03997802734375,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.2964186370372772,
"reward_std": 0.02699094917625189,
"rewards/clip_reward": 0.2964186370372772,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003668141115546445,
"grad_norm": 0.11055310636563517,
"kl": 0.036590576171875,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.305886946618557,
"reward_std": 0.03268259018659592,
"rewards/clip_reward": 0.305886946618557,
"step": 119
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003698965830803138,
"grad_norm": 0.10880733102325107,
"kl": 0.04144287109375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3160470202565193,
"reward_std": 0.03552013309672475,
"rewards/clip_reward": 0.3160470202565193,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003729790546059831,
"grad_norm": 0.1084319678958757,
"kl": 0.03759765625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.30664851516485214,
"reward_std": 0.03642770275473595,
"rewards/clip_reward": 0.30664851516485214,
"step": 121
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0037606152613165237,
"grad_norm": 0.10716440463937717,
"kl": 0.03912353515625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.3061741515994072,
"reward_std": 0.03255116753280163,
"rewards/clip_reward": 0.3061741515994072,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0037914399765732166,
"grad_norm": 0.10921412355349838,
"kl": 0.04193115234375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3017224445939064,
"reward_std": 0.035058747977018356,
"rewards/clip_reward": 0.3017224445939064,
"step": 123
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003822264691829909,
"grad_norm": 0.10891562876371483,
"kl": 0.04351806640625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3088080510497093,
"reward_std": 0.03202015720307827,
"rewards/clip_reward": 0.3088080510497093,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003853089407086602,
"grad_norm": 0.12400681002324238,
"kl": 0.0430908203125,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.30519475787878036,
"reward_std": 0.04081529099494219,
"rewards/clip_reward": 0.30519475787878036,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003883914122343295,
"grad_norm": 0.10684242248297056,
"kl": 0.03955078125,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.2999924272298813,
"reward_std": 0.027399181853979826,
"rewards/clip_reward": 0.2999924272298813,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003914738837599988,
"grad_norm": 0.13383282223405826,
"kl": 0.043212890625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.29980389028787613,
"reward_std": 0.02915497263893485,
"rewards/clip_reward": 0.29980389028787613,
"step": 127
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.003945563552856681,
"grad_norm": 0.10680237923679747,
"kl": 0.0423583984375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3113924115896225,
"reward_std": 0.034374223090708256,
"rewards/clip_reward": 0.3113924115896225,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0039763882681133735,
"grad_norm": 0.11633733033299526,
"kl": 0.0419921875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3037688434123993,
"reward_std": 0.030008903238922358,
"rewards/clip_reward": 0.3037688434123993,
"step": 129
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004007212983370066,
"grad_norm": 0.10824185375409869,
"kl": 0.043212890625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3037646785378456,
"reward_std": 0.03128322120755911,
"rewards/clip_reward": 0.3037646785378456,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004038037698626759,
"grad_norm": 0.11498093646788146,
"kl": 0.04241943359375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3007878288626671,
"reward_std": 0.035366450902074575,
"rewards/clip_reward": 0.3007878288626671,
"step": 131
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004068862413883452,
"grad_norm": 0.10988923049203525,
"kl": 0.04412841796875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.2908458858728409,
"reward_std": 0.037285988219082355,
"rewards/clip_reward": 0.2908458858728409,
"step": 132
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004099687129140144,
"grad_norm": 0.11966799658540882,
"kl": 0.0472412109375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.31911730766296387,
"reward_std": 0.03753689955919981,
"rewards/clip_reward": 0.31911730766296387,
"step": 133
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004130511844396837,
"grad_norm": 0.11694661938666558,
"kl": 0.0423583984375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.2962675616145134,
"reward_std": 0.03178291115909815,
"rewards/clip_reward": 0.2962675616145134,
"step": 134
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00416133655965353,
"grad_norm": 0.12087212424369766,
"kl": 0.04498291015625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.2907417491078377,
"reward_std": 0.039098432287573814,
"rewards/clip_reward": 0.2907417491078377,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004192161274910223,
"grad_norm": 0.1287243807650102,
"kl": 0.04473876953125,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3177812397480011,
"reward_std": 0.03574381256476045,
"rewards/clip_reward": 0.3177812397480011,
"step": 136
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004222985990166916,
"grad_norm": 0.10737365706668978,
"kl": 0.04510498046875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3081594184041023,
"reward_std": 0.038351588882505894,
"rewards/clip_reward": 0.3081594184041023,
"step": 137
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004253810705423609,
"grad_norm": 0.10905175589597935,
"kl": 0.04962158203125,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.3101942911744118,
"reward_std": 0.036289566196501255,
"rewards/clip_reward": 0.3101942911744118,
"step": 138
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0042846354206803015,
"grad_norm": 0.11276085217923483,
"kl": 0.045166015625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.30623695254325867,
"reward_std": 0.03592977672815323,
"rewards/clip_reward": 0.30623695254325867,
"step": 139
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004315460135936994,
"grad_norm": 0.11293325800639886,
"kl": 0.04168701171875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.30889374017715454,
"reward_std": 0.037142093293368816,
"rewards/clip_reward": 0.30889374017715454,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004346284851193687,
"grad_norm": 0.11777523910607922,
"kl": 0.0472412109375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3147361949086189,
"reward_std": 0.03566309390589595,
"rewards/clip_reward": 0.3147361949086189,
"step": 141
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00437710956645038,
"grad_norm": 0.11847683399423899,
"kl": 0.04412841796875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.30750318616628647,
"reward_std": 0.031021112576127052,
"rewards/clip_reward": 0.30750318616628647,
"step": 142
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004407934281707073,
"grad_norm": 0.2325766643168979,
"kl": 0.04681396484375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3154006227850914,
"reward_std": 0.03396408865228295,
"rewards/clip_reward": 0.3154006227850914,
"step": 143
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004438758996963765,
"grad_norm": 0.12205440467431385,
"kl": 0.04742431640625,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3048614263534546,
"reward_std": 0.03609074279665947,
"rewards/clip_reward": 0.3048614263534546,
"step": 144
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004469583712220458,
"grad_norm": 0.23235604214068906,
"kl": 0.04681396484375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.311428502202034,
"reward_std": 0.03360119927674532,
"rewards/clip_reward": 0.311428502202034,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004500408427477151,
"grad_norm": 0.10888274692722734,
"kl": 0.04388427734375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.31328508257865906,
"reward_std": 0.03548012813553214,
"rewards/clip_reward": 0.31328508257865906,
"step": 146
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004531233142733844,
"grad_norm": 0.10880868321131348,
"kl": 0.04425048828125,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3104088082909584,
"reward_std": 0.03424055827781558,
"rewards/clip_reward": 0.3104088082909584,
"step": 147
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004562057857990537,
"grad_norm": 0.13184897587638666,
"kl": 0.03912353515625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.32070276886224747,
"reward_std": 0.035730645060539246,
"rewards/clip_reward": 0.32070276886224747,
"step": 148
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00459288257324723,
"grad_norm": 0.13040013350171695,
"kl": 0.036773681640625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.28690390288829803,
"reward_std": 0.031756586860865355,
"rewards/clip_reward": 0.28690390288829803,
"step": 149
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0046237072885039225,
"grad_norm": 0.11413869565578771,
"kl": 0.0396728515625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.29507312178611755,
"reward_std": 0.03319581504911184,
"rewards/clip_reward": 0.29507312178611755,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004654532003760615,
"grad_norm": 0.11075253321749193,
"kl": 0.04193115234375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3052366375923157,
"reward_std": 0.03602536814287305,
"rewards/clip_reward": 0.3052366375923157,
"step": 151
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004685356719017308,
"grad_norm": 0.10754829754075829,
"kl": 0.0465087890625,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.32933981716632843,
"reward_std": 0.03219308517873287,
"rewards/clip_reward": 0.32933981716632843,
"step": 152
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004716181434274001,
"grad_norm": 0.11311221875180785,
"kl": 0.03924560546875,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.32051652669906616,
"reward_std": 0.03876081760972738,
"rewards/clip_reward": 0.32051652669906616,
"step": 153
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004747006149530694,
"grad_norm": 0.10485333011345627,
"kl": 0.04144287109375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3038826063275337,
"reward_std": 0.03334263851866126,
"rewards/clip_reward": 0.3038826063275337,
"step": 154
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004777830864787387,
"grad_norm": 0.2148098423660176,
"kl": 0.05029296875,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.3050876185297966,
"reward_std": 0.03197276359423995,
"rewards/clip_reward": 0.3050876185297966,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004808655580044079,
"grad_norm": 0.11844316361896051,
"kl": 0.03863525390625,
"learning_rate": 1e-05,
"loss": 0.0015,
"reward": 0.32097869366407394,
"reward_std": 0.029324380215257406,
"rewards/clip_reward": 0.32097869366407394,
"step": 156
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004839480295300772,
"grad_norm": 0.11792606634997942,
"kl": 0.04254150390625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3132959380745888,
"reward_std": 0.027422321029007435,
"rewards/clip_reward": 0.3132959380745888,
"step": 157
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004870305010557465,
"grad_norm": 0.11837395235594217,
"kl": 0.0396728515625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.3073809891939163,
"reward_std": 0.03928511310368776,
"rewards/clip_reward": 0.3073809891939163,
"step": 158
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004901129725814158,
"grad_norm": 0.11129310536332628,
"kl": 0.0430908203125,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.2984026074409485,
"reward_std": 0.03139211004599929,
"rewards/clip_reward": 0.2984026074409485,
"step": 159
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0049319544410708505,
"grad_norm": 0.11532428624713585,
"kl": 0.043212890625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3008668124675751,
"reward_std": 0.02627889020368457,
"rewards/clip_reward": 0.3008668124675751,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004962779156327543,
"grad_norm": 0.11786021221814753,
"kl": 0.049560546875,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.28665469214320183,
"reward_std": 0.03704976849257946,
"rewards/clip_reward": 0.28665469214320183,
"step": 161
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.004993603871584236,
"grad_norm": 0.11010075273074264,
"kl": 0.0458984375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3088128864765167,
"reward_std": 0.03618460427969694,
"rewards/clip_reward": 0.3088128864765167,
"step": 162
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005024428586840929,
"grad_norm": 0.13239934947372414,
"kl": 0.04388427734375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3036596402525902,
"reward_std": 0.032796021085232496,
"rewards/clip_reward": 0.3036596402525902,
"step": 163
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005055253302097622,
"grad_norm": 0.11413695831587789,
"kl": 0.04144287109375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.304002046585083,
"reward_std": 0.029863339848816395,
"rewards/clip_reward": 0.304002046585083,
"step": 164
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005086078017354315,
"grad_norm": 0.11564845823686422,
"kl": 0.04547119140625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.29191841185092926,
"reward_std": 0.035626471508294344,
"rewards/clip_reward": 0.29191841185092926,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005116902732611008,
"grad_norm": 0.11329154232078235,
"kl": 0.0458984375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3206318989396095,
"reward_std": 0.031096128281205893,
"rewards/clip_reward": 0.3206318989396095,
"step": 166
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0051477274478677,
"grad_norm": 0.13085517741110167,
"kl": 0.04510498046875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.28686685115098953,
"reward_std": 0.028212732169777155,
"rewards/clip_reward": 0.28686685115098953,
"step": 167
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005178552163124393,
"grad_norm": 0.35649465481543857,
"kl": 0.05712890625,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.3119603246450424,
"reward_std": 0.03256976744160056,
"rewards/clip_reward": 0.3119603246450424,
"step": 168
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005209376878381086,
"grad_norm": 0.13826424691903896,
"kl": 0.047607421875,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.2963334918022156,
"reward_std": 0.035677722189575434,
"rewards/clip_reward": 0.2963334918022156,
"step": 169
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005240201593637779,
"grad_norm": 0.1534567095384766,
"kl": 0.05511474609375,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.3064122945070267,
"reward_std": 0.030035972129553556,
"rewards/clip_reward": 0.3064122945070267,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0052710263088944715,
"grad_norm": 0.14535366087971044,
"kl": 0.0533447265625,
"learning_rate": 1e-05,
"loss": 0.0021,
"reward": 0.31411340832710266,
"reward_std": 0.03346576215699315,
"rewards/clip_reward": 0.31411340832710266,
"step": 171
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005301851024151164,
"grad_norm": 0.11182081037914528,
"kl": 0.04864501953125,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.31372761726379395,
"reward_std": 0.028209302574396133,
"rewards/clip_reward": 0.31372761726379395,
"step": 172
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005332675739407857,
"grad_norm": 0.10998805611249249,
"kl": 0.0506591796875,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.3076253980398178,
"reward_std": 0.03530415939167142,
"rewards/clip_reward": 0.3076253980398178,
"step": 173
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00536350045466455,
"grad_norm": 0.11782096135818707,
"kl": 0.04974365234375,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.31246717274188995,
"reward_std": 0.029364202171564102,
"rewards/clip_reward": 0.31246717274188995,
"step": 174
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005394325169921243,
"grad_norm": 0.12510210623580764,
"kl": 0.045166015625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3149753659963608,
"reward_std": 0.03486820124089718,
"rewards/clip_reward": 0.3149753659963608,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005425149885177936,
"grad_norm": 0.12073650226365598,
"kl": 0.04754638671875,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3087178245186806,
"reward_std": 0.03398646041750908,
"rewards/clip_reward": 0.3087178245186806,
"step": 176
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005455974600434629,
"grad_norm": 0.16141440704371318,
"kl": 0.046630859375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.32408736646175385,
"reward_std": 0.03735980670899153,
"rewards/clip_reward": 0.32408736646175385,
"step": 177
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005486799315691322,
"grad_norm": 0.12441494367513423,
"kl": 0.04547119140625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.32129447907209396,
"reward_std": 0.02921806275844574,
"rewards/clip_reward": 0.32129447907209396,
"step": 178
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005517624030948014,
"grad_norm": 0.11472079048722336,
"kl": 0.04193115234375,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3252818211913109,
"reward_std": 0.032147477846592665,
"rewards/clip_reward": 0.3252818211913109,
"step": 179
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005548448746204707,
"grad_norm": 0.11643678981280302,
"kl": 0.04559326171875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.31385669857263565,
"reward_std": 0.031697194557636976,
"rewards/clip_reward": 0.31385669857263565,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0055792734614613995,
"grad_norm": 0.10985906766240239,
"kl": 0.0438232421875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3218560591340065,
"reward_std": 0.030952177941799164,
"rewards/clip_reward": 0.3218560591340065,
"step": 181
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005610098176718092,
"grad_norm": 0.12318124273748278,
"kl": 0.044189453125,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3146945610642433,
"reward_std": 0.037078809924423695,
"rewards/clip_reward": 0.3146945610642433,
"step": 182
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005640922891974785,
"grad_norm": 0.10855110593795812,
"kl": 0.0469970703125,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3247036188840866,
"reward_std": 0.03742914833128452,
"rewards/clip_reward": 0.3247036188840866,
"step": 183
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005671747607231478,
"grad_norm": 0.12463338581811485,
"kl": 0.04730224609375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3007904663681984,
"reward_std": 0.030957046430557966,
"rewards/clip_reward": 0.3007904663681984,
"step": 184
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005702572322488171,
"grad_norm": 0.11738351599765742,
"kl": 0.04608154296875,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3152424767613411,
"reward_std": 0.032630473375320435,
"rewards/clip_reward": 0.3152424767613411,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005733397037744864,
"grad_norm": 0.12425233709466331,
"kl": 0.0426025390625,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3105937987565994,
"reward_std": 0.037436836399137974,
"rewards/clip_reward": 0.3105937987565994,
"step": 186
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005764221753001557,
"grad_norm": 0.24540233073575124,
"kl": 0.0604248046875,
"learning_rate": 1e-05,
"loss": 0.0024,
"reward": 0.3193615674972534,
"reward_std": 0.02923456858843565,
"rewards/clip_reward": 0.3193615674972534,
"step": 187
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00579504646825825,
"grad_norm": 0.11122419357690229,
"kl": 0.04248046875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.29561520367860794,
"reward_std": 0.03093513334169984,
"rewards/clip_reward": 0.29561520367860794,
"step": 188
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005825871183514943,
"grad_norm": 0.11723471576199568,
"kl": 0.04541015625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3073701113462448,
"reward_std": 0.03550923429429531,
"rewards/clip_reward": 0.3073701113462448,
"step": 189
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005856695898771635,
"grad_norm": 0.10846043413336356,
"kl": 0.04443359375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3202974200248718,
"reward_std": 0.03128951042890549,
"rewards/clip_reward": 0.3202974200248718,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005887520614028328,
"grad_norm": 0.10684544727987075,
"kl": 0.04400634765625,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3150642290711403,
"reward_std": 0.038218459114432335,
"rewards/clip_reward": 0.3150642290711403,
"step": 191
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0059183453292850205,
"grad_norm": 0.12179014361836414,
"kl": 0.0406494140625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.3023750111460686,
"reward_std": 0.028901703655719757,
"rewards/clip_reward": 0.3023750111460686,
"step": 192
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005949170044541713,
"grad_norm": 0.11287740721272002,
"kl": 0.04296875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.30537480115890503,
"reward_std": 0.03340973751619458,
"rewards/clip_reward": 0.30537480115890503,
"step": 193
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.005979994759798406,
"grad_norm": 0.11953948724468283,
"kl": 0.0406494140625,
"learning_rate": 1e-05,
"loss": 0.0016,
"reward": 0.2977057322859764,
"reward_std": 0.041334839537739754,
"rewards/clip_reward": 0.2977057322859764,
"step": 194
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006010819475055099,
"grad_norm": 0.33669753751768144,
"kl": 0.05206298828125,
"learning_rate": 1e-05,
"loss": 0.0021,
"reward": 0.3174924701452255,
"reward_std": 0.03484439663589001,
"rewards/clip_reward": 0.3174924701452255,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006041644190311792,
"grad_norm": 0.13081401668031936,
"kl": 0.0428466796875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.31883813440799713,
"reward_std": 0.03002287307754159,
"rewards/clip_reward": 0.31883813440799713,
"step": 196
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006072468905568485,
"grad_norm": 0.15362262357445108,
"kl": 0.04388427734375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3148084282875061,
"reward_std": 0.03563790209591389,
"rewards/clip_reward": 0.3148084282875061,
"step": 197
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006103293620825178,
"grad_norm": 0.12036606667480174,
"kl": 0.04522705078125,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3185431882739067,
"reward_std": 0.03547387337312102,
"rewards/clip_reward": 0.3185431882739067,
"step": 198
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006134118336081871,
"grad_norm": 0.11265487428982754,
"kl": 0.04522705078125,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.3013475388288498,
"reward_std": 0.027531601022928953,
"rewards/clip_reward": 0.3013475388288498,
"step": 199
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006164943051338564,
"grad_norm": 0.12480397529853451,
"kl": 0.04632568359375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.2954695001244545,
"reward_std": 0.035041794180870056,
"rewards/clip_reward": 0.2954695001244545,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0061957677665952565,
"grad_norm": 0.11326384693922859,
"kl": 0.04779052734375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.30719564110040665,
"reward_std": 0.03018721006810665,
"rewards/clip_reward": 0.30719564110040665,
"step": 201
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0062265924818519485,
"grad_norm": 0.14001014918782298,
"kl": 0.0477294921875,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.27801472693681717,
"reward_std": 0.029184456914663315,
"rewards/clip_reward": 0.27801472693681717,
"step": 202
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006257417197108641,
"grad_norm": 0.13598377277553347,
"kl": 0.04864501953125,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.31848207861185074,
"reward_std": 0.029095898382365704,
"rewards/clip_reward": 0.31848207861185074,
"step": 203
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006288241912365334,
"grad_norm": 0.12990427030922783,
"kl": 0.04669189453125,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3233681470155716,
"reward_std": 0.02884372603148222,
"rewards/clip_reward": 0.3233681470155716,
"step": 204
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006319066627622027,
"grad_norm": 0.11337348510966652,
"kl": 0.04705810546875,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.30711568146944046,
"reward_std": 0.027397962752729654,
"rewards/clip_reward": 0.30711568146944046,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00634989134287872,
"grad_norm": 0.11008239157225132,
"kl": 0.04534912109375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.31931688636541367,
"reward_std": 0.029018502216786146,
"rewards/clip_reward": 0.31931688636541367,
"step": 206
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006380716058135413,
"grad_norm": 0.11005906943350524,
"kl": 0.04534912109375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.2889741063117981,
"reward_std": 0.03417292470112443,
"rewards/clip_reward": 0.2889741063117981,
"step": 207
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006411540773392106,
"grad_norm": 0.13107793374479376,
"kl": 0.04388427734375,
"learning_rate": 1e-05,
"loss": 0.0018,
"reward": 0.316011942923069,
"reward_std": 0.035842195618897676,
"rewards/clip_reward": 0.316011942923069,
"step": 208
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006442365488648799,
"grad_norm": 0.11921653233574052,
"kl": 0.04827880859375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3120528683066368,
"reward_std": 0.03157835826277733,
"rewards/clip_reward": 0.3120528683066368,
"step": 209
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006473190203905492,
"grad_norm": 0.11671423552831947,
"kl": 0.04705810546875,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3096734285354614,
"reward_std": 0.02532344777137041,
"rewards/clip_reward": 0.3096734285354614,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0065040149191621845,
"grad_norm": 0.12478193702072561,
"kl": 0.04876708984375,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.3255714699625969,
"reward_std": 0.03476850828155875,
"rewards/clip_reward": 0.3255714699625969,
"step": 211
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006534839634418877,
"grad_norm": 0.11234041456760836,
"kl": 0.043701171875,
"learning_rate": 1e-05,
"loss": 0.0017,
"reward": 0.3205392137169838,
"reward_std": 0.02879873849451542,
"rewards/clip_reward": 0.3205392137169838,
"step": 212
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0065656643496755695,
"grad_norm": 0.11484489412906124,
"kl": 0.04779052734375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.30573664605617523,
"reward_std": 0.034792355727404356,
"rewards/clip_reward": 0.30573664605617523,
"step": 213
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006596489064932262,
"grad_norm": 0.112973116545128,
"kl": 0.0565185546875,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.322362020611763,
"reward_std": 0.040699029341340065,
"rewards/clip_reward": 0.322362020611763,
"step": 214
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006627313780188955,
"grad_norm": 0.11000967164413539,
"kl": 0.0496826171875,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.32849258929491043,
"reward_std": 0.03333634790033102,
"rewards/clip_reward": 0.32849258929491043,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006658138495445648,
"grad_norm": 0.11266446524081561,
"kl": 0.04681396484375,
"learning_rate": 1e-05,
"loss": 0.0019,
"reward": 0.3268875405192375,
"reward_std": 0.041359793394804,
"rewards/clip_reward": 0.3268875405192375,
"step": 216
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006688963210702341,
"grad_norm": 0.1297650863707243,
"kl": 0.05572509765625,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.3042534068226814,
"reward_std": 0.029003426898270845,
"rewards/clip_reward": 0.3042534068226814,
"step": 217
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006719787925959034,
"grad_norm": 0.1224231057191533,
"kl": 0.0494384765625,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.322785347700119,
"reward_std": 0.03285923460498452,
"rewards/clip_reward": 0.322785347700119,
"step": 218
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006750612641215727,
"grad_norm": 0.11197370609789116,
"kl": 0.0545654296875,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.3130335509777069,
"reward_std": 0.028378690592944622,
"rewards/clip_reward": 0.3130335509777069,
"step": 219
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00678143735647242,
"grad_norm": 0.12265292069342315,
"kl": 0.0526123046875,
"learning_rate": 1e-05,
"loss": 0.0021,
"reward": 0.3101393133401871,
"reward_std": 0.031088348012417555,
"rewards/clip_reward": 0.3101393133401871,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006812262071729113,
"grad_norm": 0.1301967649695606,
"kl": 0.0504150390625,
"learning_rate": 1e-05,
"loss": 0.002,
"reward": 0.3258736953139305,
"reward_std": 0.030232697259634733,
"rewards/clip_reward": 0.3258736953139305,
"step": 221
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0068430867869858055,
"grad_norm": 0.12267903014610237,
"kl": 0.05633544921875,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.3135734647512436,
"reward_std": 0.029037311673164368,
"rewards/clip_reward": 0.3135734647512436,
"step": 222
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006873911502242498,
"grad_norm": 0.1556499324095671,
"kl": 0.05694580078125,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.3139733672142029,
"reward_std": 0.03242505481466651,
"rewards/clip_reward": 0.3139733672142029,
"step": 223
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006904736217499191,
"grad_norm": 0.14040668849148338,
"kl": 0.05712890625,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.2902194894850254,
"reward_std": 0.030722644180059433,
"rewards/clip_reward": 0.2902194894850254,
"step": 224
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006935560932755883,
"grad_norm": 0.12469801158231446,
"kl": 0.05535888671875,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.3171394020318985,
"reward_std": 0.032871958799660206,
"rewards/clip_reward": 0.3171394020318985,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006966385648012576,
"grad_norm": 0.11664661161299794,
"kl": 0.0555419921875,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.30288901180028915,
"reward_std": 0.03361931908875704,
"rewards/clip_reward": 0.30288901180028915,
"step": 226
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.006997210363269269,
"grad_norm": 0.12368362093823822,
"kl": 0.0579833984375,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.29918094724416733,
"reward_std": 0.032192114274948835,
"rewards/clip_reward": 0.29918094724416733,
"step": 227
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007028035078525962,
"grad_norm": 0.13670193873160075,
"kl": 0.0626220703125,
"learning_rate": 1e-05,
"loss": 0.0025,
"reward": 0.30772917717695236,
"reward_std": 0.03168489225208759,
"rewards/clip_reward": 0.30772917717695236,
"step": 228
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007058859793782655,
"grad_norm": 0.15557420264040966,
"kl": 0.053955078125,
"learning_rate": 1e-05,
"loss": 0.0022,
"reward": 0.28914518654346466,
"reward_std": 0.03747545275837183,
"rewards/clip_reward": 0.28914518654346466,
"step": 229
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007089684509039348,
"grad_norm": 0.3182664775944573,
"kl": 0.08154296875,
"learning_rate": 1e-05,
"loss": 0.0033,
"reward": 0.311465322971344,
"reward_std": 0.03380277007818222,
"rewards/clip_reward": 0.311465322971344,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007120509224296041,
"grad_norm": 0.12693032913934957,
"kl": 0.0616455078125,
"learning_rate": 1e-05,
"loss": 0.0025,
"reward": 0.32272306829690933,
"reward_std": 0.03391577862203121,
"rewards/clip_reward": 0.32272306829690933,
"step": 231
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0071513339395527335,
"grad_norm": 4.296122241941977,
"kl": 0.45172119140625,
"learning_rate": 1e-05,
"loss": 0.0181,
"reward": 0.32933176308870316,
"reward_std": 0.03596270922571421,
"rewards/clip_reward": 0.32933176308870316,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007182158654809426,
"grad_norm": 0.12252452858451943,
"kl": 0.06103515625,
"learning_rate": 1e-05,
"loss": 0.0024,
"reward": 0.30073945224285126,
"reward_std": 0.035156805999577045,
"rewards/clip_reward": 0.30073945224285126,
"step": 233
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007212983370066119,
"grad_norm": 0.18753510109293484,
"kl": 0.0567626953125,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.3166045621037483,
"reward_std": 0.036252960562705994,
"rewards/clip_reward": 0.3166045621037483,
"step": 234
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007243808085322812,
"grad_norm": 0.12244712083969155,
"kl": 0.0599365234375,
"learning_rate": 1e-05,
"loss": 0.0024,
"reward": 0.32098332792520523,
"reward_std": 0.02912920992821455,
"rewards/clip_reward": 0.32098332792520523,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007274632800579504,
"grad_norm": 0.1153924535202381,
"kl": 0.057373046875,
"learning_rate": 1e-05,
"loss": 0.0023,
"reward": 0.30156801640987396,
"reward_std": 0.03506749775260687,
"rewards/clip_reward": 0.30156801640987396,
"step": 236
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007305457515836197,
"grad_norm": 982.1130719581894,
"kl": 11.4212646484375,
"learning_rate": 1e-05,
"loss": 0.4579,
"reward": 0.3112705275416374,
"reward_std": 0.03293308801949024,
"rewards/clip_reward": 0.3112705275416374,
"step": 237
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00733628223109289,
"grad_norm": 0.13912658954576582,
"kl": 0.06134033203125,
"learning_rate": 1e-05,
"loss": 0.0025,
"reward": 0.31086497753858566,
"reward_std": 0.03602492017671466,
"rewards/clip_reward": 0.31086497753858566,
"step": 238
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007367106946349583,
"grad_norm": 0.15900496360182007,
"kl": 0.072021484375,
"learning_rate": 1e-05,
"loss": 0.0029,
"reward": 0.301338754594326,
"reward_std": 0.03604850126430392,
"rewards/clip_reward": 0.301338754594326,
"step": 239
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007397931661606276,
"grad_norm": 0.11682552010464033,
"kl": 0.069091796875,
"learning_rate": 1e-05,
"loss": 0.0028,
"reward": 0.310367226600647,
"reward_std": 0.033799303229898214,
"rewards/clip_reward": 0.310367226600647,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007428756376862969,
"grad_norm": 0.6484753019850781,
"kl": 0.1041259765625,
"learning_rate": 1e-05,
"loss": 0.0042,
"reward": 0.32271357625722885,
"reward_std": 0.03783900523558259,
"rewards/clip_reward": 0.32271357625722885,
"step": 241
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007459581092119662,
"grad_norm": 0.12488153334727711,
"kl": 0.06195068359375,
"learning_rate": 1e-05,
"loss": 0.0025,
"reward": 0.30252621322870255,
"reward_std": 0.03500718716531992,
"rewards/clip_reward": 0.30252621322870255,
"step": 242
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.0074904058073763545,
"grad_norm": 0.11793262109697311,
"kl": 0.0692138671875,
"learning_rate": 1e-05,
"loss": 0.0028,
"reward": 0.3169676512479782,
"reward_std": 0.03632183885201812,
"rewards/clip_reward": 0.3169676512479782,
"step": 243
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007521230522633047,
"grad_norm": 0.11916442015495367,
"kl": 0.06756591796875,
"learning_rate": 1e-05,
"loss": 0.0027,
"reward": 0.32120058685541153,
"reward_std": 0.03682229993864894,
"rewards/clip_reward": 0.32120058685541153,
"step": 244
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.00755205523788974,
"grad_norm": 0.13178282787519513,
"kl": 0.068115234375,
"learning_rate": 1e-05,
"loss": 0.0027,
"reward": 0.3041011393070221,
"reward_std": 0.03216708730906248,
"rewards/clip_reward": 0.3041011393070221,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007582879953146433,
"grad_norm": 0.1209756865386705,
"kl": 0.07470703125,
"learning_rate": 1e-05,
"loss": 0.003,
"reward": 0.30233804881572723,
"reward_std": 0.03544025868177414,
"rewards/clip_reward": 0.30233804881572723,
"step": 246
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007613704668403126,
"grad_norm": 0.14259183948300264,
"kl": 0.082275390625,
"learning_rate": 1e-05,
"loss": 0.0033,
"reward": 0.297327883541584,
"reward_std": 0.03194120712578297,
"rewards/clip_reward": 0.297327883541584,
"step": 247
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007644529383659818,
"grad_norm": 0.11966570448950067,
"kl": 0.07708740234375,
"learning_rate": 1e-05,
"loss": 0.0031,
"reward": 0.3017484247684479,
"reward_std": 0.03380720689892769,
"rewards/clip_reward": 0.3017484247684479,
"step": 248
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007675354098916511,
"grad_norm": 0.11584912623106094,
"kl": 0.079345703125,
"learning_rate": 1e-05,
"loss": 0.0032,
"reward": 0.2936403974890709,
"reward_std": 0.03349851304665208,
"rewards/clip_reward": 0.2936403974890709,
"step": 249
},
{
"clip_ratio": 0.0,
"completion_length": 4096.0,
"epoch": 0.007706178814173204,
"grad_norm": 0.13824008661423945,
"kl": 0.0780029296875,
"learning_rate": 1e-05,
"loss": 0.0031,
"reward": 0.3258681297302246,
"reward_std": 0.04007330071181059,
"rewards/clip_reward": 0.3258681297302246,
"step": 250
}
],
"logging_steps": 1,
"max_steps": 32441,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}