{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.12149532710280374,
"eval_steps": 500,
"global_step": 130,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 2213.6484375,
"epoch": 0.0009345794392523365,
"grad_norm": 0.9274865773093601,
"kl": 0.0,
"learning_rate": 5e-07,
"loss": 0.0131,
"reward": 0.3671875114669092,
"reward_std": 0.3501587579958141,
"rewards/end_of_conversation_reward_func": 0.05468750116415322,
"rewards/end_rm_reward_func": 0.3125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 1
},
{
"clip_ratio": 0.025033144396729767,
"epoch": 0.001869158878504673,
"grad_norm": 0.8911286780735345,
"kl": 0.006732940673828125,
"learning_rate": 5e-07,
"loss": 0.0136,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 1924.0390625,
"epoch": 0.002803738317757009,
"grad_norm": 1.0812826786849123,
"kl": 0.007961273193359375,
"learning_rate": 5e-07,
"loss": 0.0103,
"reward": 0.30449220282025635,
"reward_std": 0.3021779216360301,
"rewards/end_of_conversation_reward_func": 0.07890625158324838,
"rewards/end_rm_reward_func": 0.2255859375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 3
},
{
"clip_ratio": 0.02741119544953108,
"epoch": 0.003738317757009346,
"grad_norm": 1.0223081613434297,
"kl": 0.007640838623046875,
"learning_rate": 5e-07,
"loss": 0.0106,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 2317.28125,
"epoch": 0.004672897196261682,
"grad_norm": 0.9173763955261445,
"kl": 0.006732940673828125,
"learning_rate": 5e-07,
"loss": 0.0504,
"reward": 0.23222656873986125,
"reward_std": 0.2611159069929272,
"rewards/end_of_conversation_reward_func": 0.0632812503608875,
"rewards/end_rm_reward_func": 0.1689453125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 5
},
{
"clip_ratio": 0.025625309790484607,
"epoch": 0.005607476635514018,
"grad_norm": 0.9085778304220192,
"kl": 0.0066356658935546875,
"learning_rate": 5e-07,
"loss": 0.0505,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 1964.3046875,
"epoch": 0.0065420560747663555,
"grad_norm": 1.0789728731378576,
"kl": 0.00862884521484375,
"learning_rate": 5e-07,
"loss": 0.0348,
"reward": 0.39062501583248377,
"reward_std": 0.3314252281561494,
"rewards/end_of_conversation_reward_func": 0.0703125016298145,
"rewards/end_rm_reward_func": 0.3203125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 7
},
{
"clip_ratio": 0.027227300102822483,
"epoch": 0.007476635514018692,
"grad_norm": 1.012376163737541,
"kl": 0.009044647216796875,
"learning_rate": 5e-07,
"loss": 0.0351,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 2030.8828125,
"epoch": 0.008411214953271028,
"grad_norm": 0.9821357312327096,
"kl": 0.00811767578125,
"learning_rate": 5e-07,
"loss": 0.0024,
"reward": 0.4171875137835741,
"reward_std": 0.31918896292336285,
"rewards/end_of_conversation_reward_func": 0.06171875057043508,
"rewards/end_rm_reward_func": 0.35546875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 9
},
{
"clip_ratio": 0.02804350107908249,
"epoch": 0.009345794392523364,
"grad_norm": 0.9187020172705397,
"kl": 0.01328277587890625,
"learning_rate": 5e-07,
"loss": 0.0028,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 2098.609375,
"epoch": 0.010280373831775701,
"grad_norm": 1.0389274688932995,
"kl": 0.013088226318359375,
"learning_rate": 5e-07,
"loss": 0.0005,
"reward": 0.38964845007285476,
"reward_std": 0.3262035925872624,
"rewards/end_of_conversation_reward_func": 0.07031250116415322,
"rewards/end_rm_reward_func": 0.3193359375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 11
},
{
"clip_ratio": 0.02878896868787706,
"epoch": 0.011214953271028037,
"grad_norm": 1.005444013858441,
"kl": 0.015062332153320312,
"learning_rate": 5e-07,
"loss": 0.0011,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 1921.2734375,
"epoch": 0.012149532710280374,
"grad_norm": 1.0317328787031124,
"kl": 0.013071060180664062,
"learning_rate": 5e-07,
"loss": 0.0341,
"reward": 0.3585937600582838,
"reward_std": 0.3541586115024984,
"rewards/end_of_conversation_reward_func": 0.06171875016298145,
"rewards/end_rm_reward_func": 0.296875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 13
},
{
"clip_ratio": 0.029582467046566308,
"epoch": 0.013084112149532711,
"grad_norm": 0.9635840549431879,
"kl": 0.01287078857421875,
"learning_rate": 5e-07,
"loss": 0.0345,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 2046.6484375,
"epoch": 0.014018691588785047,
"grad_norm": 1.0353766722235376,
"kl": 0.012033462524414062,
"learning_rate": 5e-07,
"loss": 0.0376,
"reward": 0.31542970187729225,
"reward_std": 0.308479615021497,
"rewards/end_of_conversation_reward_func": 0.054687501513399184,
"rewards/end_rm_reward_func": 0.2607421875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 15
},
{
"clip_ratio": 0.028461063280701637,
"epoch": 0.014953271028037384,
"grad_norm": 0.9802840358814225,
"kl": 0.013523101806640625,
"learning_rate": 5e-07,
"loss": 0.0384,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 1996.390625,
"epoch": 0.01588785046728972,
"grad_norm": 1.0375805713420467,
"kl": 0.014972686767578125,
"learning_rate": 5e-07,
"loss": 0.0257,
"reward": 0.3818359471624717,
"reward_std": 0.2914374100510031,
"rewards/end_of_conversation_reward_func": 0.058593750873114914,
"rewards/end_rm_reward_func": 0.3232421875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 17
},
{
"clip_ratio": 0.027674222481437027,
"epoch": 0.016822429906542057,
"grad_norm": 1.0160437873794967,
"kl": 0.02030181884765625,
"learning_rate": 5e-07,
"loss": 0.0264,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 1832.9375,
"epoch": 0.017757009345794394,
"grad_norm": 1.0312841889104536,
"kl": 0.033977508544921875,
"learning_rate": 5e-07,
"loss": 0.0194,
"reward": 0.40410157246515155,
"reward_std": 0.3510888592572883,
"rewards/end_of_conversation_reward_func": 0.059375000768341124,
"rewards/end_rm_reward_func": 0.3447265625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 19
},
{
"clip_ratio": 0.027633688994683325,
"epoch": 0.018691588785046728,
"grad_norm": 0.9678211628152914,
"kl": 7.021579742431641,
"learning_rate": 5e-07,
"loss": 0.0196,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 2241.8359375,
"epoch": 0.019626168224299065,
"grad_norm": 1.0128829331154197,
"kl": 0.0332489013671875,
"learning_rate": 5e-07,
"loss": 0.0264,
"reward": 0.2441406348370947,
"reward_std": 0.2879671745467931,
"rewards/end_of_conversation_reward_func": 0.07421875081490725,
"rewards/end_rm_reward_func": 0.169921875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 21
},
{
"clip_ratio": 0.02383307204581797,
"epoch": 0.020560747663551402,
"grad_norm": 0.9296707720991904,
"kl": 0.040599822998046875,
"learning_rate": 5e-07,
"loss": 0.0268,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 2584.5,
"epoch": 0.02149532710280374,
"grad_norm": 0.8926960463526834,
"kl": 0.7261238098144531,
"learning_rate": 5e-07,
"loss": 0.0246,
"reward": 0.3001953187631443,
"reward_std": 0.31693244143389165,
"rewards/end_of_conversation_reward_func": 0.05703125102445483,
"rewards/end_rm_reward_func": 0.2431640625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 23
},
{
"clip_ratio": 0.025873058126308024,
"epoch": 0.022429906542056073,
"grad_norm": 0.8345013055872595,
"kl": 0.16021347045898438,
"learning_rate": 5e-07,
"loss": 0.025,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 1748.03125,
"epoch": 0.02336448598130841,
"grad_norm": 1.1173548960610717,
"kl": 0.04956817626953125,
"learning_rate": 5e-07,
"loss": -0.0013,
"reward": 0.5044922037050128,
"reward_std": 0.407341014361009,
"rewards/end_of_conversation_reward_func": 0.06796875083819032,
"rewards/end_rm_reward_func": 0.4365234375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 25
},
{
"clip_ratio": 0.029045677627436817,
"epoch": 0.024299065420560748,
"grad_norm": 1.0584679999454167,
"kl": 0.05883026123046875,
"learning_rate": 5e-07,
"loss": -0.0007,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 1884.03125,
"epoch": 0.025233644859813085,
"grad_norm": 1.0128102944540516,
"kl": 0.0649871826171875,
"learning_rate": 5e-07,
"loss": 0.0393,
"reward": 0.4763671956025064,
"reward_std": 0.3862752839922905,
"rewards/end_of_conversation_reward_func": 0.059375000768341124,
"rewards/end_rm_reward_func": 0.4169921875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 27
},
{
"clip_ratio": 0.028907910105772316,
"epoch": 0.026168224299065422,
"grad_norm": 0.9150592324011979,
"kl": 0.1536865234375,
"learning_rate": 5e-07,
"loss": 0.0398,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 1675.265625,
"epoch": 0.027102803738317756,
"grad_norm": 1.184705074990119,
"kl": 0.241058349609375,
"learning_rate": 5e-07,
"loss": 0.0311,
"reward": 0.3828125111758709,
"reward_std": 0.31181320175528526,
"rewards/end_of_conversation_reward_func": 0.05859375064028427,
"rewards/end_rm_reward_func": 0.32421875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 29
},
{
"clip_ratio": 0.02990245760884136,
"epoch": 0.028037383177570093,
"grad_norm": 1.1428586426066105,
"kl": 3.1220054626464844,
"learning_rate": 5e-07,
"loss": 0.0316,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 2076.8046875,
"epoch": 0.02897196261682243,
"grad_norm": 1.1023210621247226,
"kl": 0.1524505615234375,
"learning_rate": 5e-07,
"loss": 0.0032,
"reward": 0.3175781366880983,
"reward_std": 0.3118708392139524,
"rewards/end_of_conversation_reward_func": 0.06562500062864274,
"rewards/end_rm_reward_func": 0.251953125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 31
},
{
"clip_ratio": 0.028195369872264564,
"epoch": 0.029906542056074768,
"grad_norm": 1.065397234398234,
"kl": 0.14312744140625,
"learning_rate": 5e-07,
"loss": 0.0037,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 1835.15625,
"epoch": 0.0308411214953271,
"grad_norm": 1.0883386533644912,
"kl": 0.1009674072265625,
"learning_rate": 5e-07,
"loss": 0.006,
"reward": 0.47109376499429345,
"reward_std": 0.4281590711325407,
"rewards/end_of_conversation_reward_func": 0.06875000108266249,
"rewards/end_rm_reward_func": 0.40234375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 33
},
{
"clip_ratio": 0.030578713049180806,
"epoch": 0.03177570093457944,
"grad_norm": 1.0103915525345888,
"kl": 0.08032989501953125,
"learning_rate": 5e-07,
"loss": 0.0067,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 2264.5703125,
"epoch": 0.03271028037383177,
"grad_norm": 1.0203057509066855,
"kl": 2.56329345703125,
"learning_rate": 5e-07,
"loss": -0.0068,
"reward": 0.314453131519258,
"reward_std": 0.2885858778608963,
"rewards/end_of_conversation_reward_func": 0.06640625017462298,
"rewards/end_rm_reward_func": 0.248046875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 35
},
{
"clip_ratio": 0.028288635658100247,
"epoch": 0.03364485981308411,
"grad_norm": 0.9275253113815312,
"kl": 2.92437744140625,
"learning_rate": 5e-07,
"loss": -0.0062,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 2376.609375,
"epoch": 0.03457943925233645,
"grad_norm": 0.9618697576631778,
"kl": 0.0479736328125,
"learning_rate": 5e-07,
"loss": -0.0045,
"reward": 0.28437500644940883,
"reward_std": 0.3411878102924675,
"rewards/end_of_conversation_reward_func": 0.053906251036096364,
"rewards/end_rm_reward_func": 0.23046875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 37
},
{
"clip_ratio": 0.024988370947539806,
"epoch": 0.03551401869158879,
"grad_norm": 0.8895332217423063,
"kl": 0.04958343505859375,
"learning_rate": 5e-07,
"loss": -0.0041,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 1902.1640625,
"epoch": 0.03644859813084112,
"grad_norm": 1.107891872929539,
"kl": 11.097213745117188,
"learning_rate": 5e-07,
"loss": 0.0181,
"reward": 0.3992187549592927,
"reward_std": 0.31114612985402346,
"rewards/end_of_conversation_reward_func": 0.07109375146683306,
"rewards/end_rm_reward_func": 0.328125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 39
},
{
"clip_ratio": 0.02839432912878692,
"epoch": 0.037383177570093455,
"grad_norm": 1.0581025263351234,
"kl": 1.0611953735351562,
"learning_rate": 5e-07,
"loss": 0.0188,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 1947.9765625,
"epoch": 0.038317757009345796,
"grad_norm": 1.044504336049714,
"kl": 2384.179656982422,
"learning_rate": 5e-07,
"loss": 0.0119,
"reward": 0.349609378259629,
"reward_std": 0.365347285522148,
"rewards/end_of_conversation_reward_func": 0.07031250110594556,
"rewards/end_rm_reward_func": 0.279296875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 41
},
{
"clip_ratio": 0.0260704318061471,
"epoch": 0.03925233644859813,
"grad_norm": 0.9994876043774379,
"kl": 992.1758270263672,
"learning_rate": 5e-07,
"loss": 0.0123,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 1981.40625,
"epoch": 0.04018691588785047,
"grad_norm": 1.13597665166102,
"kl": 4.000732421875,
"learning_rate": 5e-07,
"loss": 0.0065,
"reward": 0.28535156534053385,
"reward_std": 0.2903470410965383,
"rewards/end_of_conversation_reward_func": 0.05000000126892701,
"rewards/end_rm_reward_func": 0.2353515625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 43
},
{
"clip_ratio": 0.028302057995460927,
"epoch": 0.041121495327102804,
"grad_norm": 0.9919056528612524,
"kl": 0.5160751342773438,
"learning_rate": 5e-07,
"loss": 0.0069,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 2229.96875,
"epoch": 0.04205607476635514,
"grad_norm": 1.1389480742032796,
"kl": 0.0689239501953125,
"learning_rate": 5e-07,
"loss": 0.0238,
"reward": 0.4009765740483999,
"reward_std": 0.32002427359111607,
"rewards/end_of_conversation_reward_func": 0.0601562510128133,
"rewards/end_rm_reward_func": 0.3408203125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 45
},
{
"clip_ratio": 0.028976862784475088,
"epoch": 0.04299065420560748,
"grad_norm": 1.0789195240827836,
"kl": 0.471649169921875,
"learning_rate": 5e-07,
"loss": 0.0244,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 2015.75,
"epoch": 0.04392523364485981,
"grad_norm": 1.0614224174398665,
"kl": 290.30157470703125,
"learning_rate": 5e-07,
"loss": 0.0326,
"reward": 0.43984375934815034,
"reward_std": 0.35610848292708397,
"rewards/end_of_conversation_reward_func": 0.06875000090803951,
"rewards/end_rm_reward_func": 0.37109375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 47
},
{
"clip_ratio": 0.029352403013035655,
"epoch": 0.044859813084112146,
"grad_norm": 1.011898536861839,
"kl": 478.43064880371094,
"learning_rate": 5e-07,
"loss": 0.0332,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 2297.375,
"epoch": 0.04579439252336449,
"grad_norm": 1.0458046562803893,
"kl": 0.3959197998046875,
"learning_rate": 5e-07,
"loss": 0.0264,
"reward": 0.40878907358273864,
"reward_std": 0.41261982172727585,
"rewards/end_of_conversation_reward_func": 0.06015625095460564,
"rewards/end_rm_reward_func": 0.3486328125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 49
},
{
"clip_ratio": 0.02658259856980294,
"epoch": 0.04672897196261682,
"grad_norm": 1.0013387198061083,
"kl": 0.1452178955078125,
"learning_rate": 5e-07,
"loss": 0.027,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 1941.359375,
"epoch": 0.04766355140186916,
"grad_norm": 1.043020590108944,
"kl": 0.4831390380859375,
"learning_rate": 5e-07,
"loss": 0.0261,
"reward": 0.36464845400769264,
"reward_std": 0.2816849281080067,
"rewards/end_of_conversation_reward_func": 0.06875000079162419,
"rewards/end_rm_reward_func": 0.2958984375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 51
},
{
"clip_ratio": 0.026326473569497466,
"epoch": 0.048598130841121495,
"grad_norm": 0.9484135414702769,
"kl": 0.2133331298828125,
"learning_rate": 5e-07,
"loss": 0.0265,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 2065.890625,
"epoch": 0.04953271028037383,
"grad_norm": 1.0419315292071203,
"kl": 0.9266204833984375,
"learning_rate": 5e-07,
"loss": 0.0207,
"reward": 0.4900390843395144,
"reward_std": 0.30717412871308625,
"rewards/end_of_conversation_reward_func": 0.07109375123400241,
"rewards/end_rm_reward_func": 0.4189453125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 53
},
{
"clip_ratio": 0.0251879645511508,
"epoch": 0.05046728971962617,
"grad_norm": 0.9533110472132086,
"kl": 0.435760498046875,
"learning_rate": 5e-07,
"loss": 0.0213,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 1789.953125,
"epoch": 0.0514018691588785,
"grad_norm": 1.1217425421263005,
"kl": 0.214447021484375,
"learning_rate": 5e-07,
"loss": 0.0412,
"reward": 0.4402343900874257,
"reward_std": 0.3780560018494725,
"rewards/end_of_conversation_reward_func": 0.07890625135041773,
"rewards/end_rm_reward_func": 0.361328125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 55
},
{
"clip_ratio": 0.030622108606621623,
"epoch": 0.052336448598130844,
"grad_norm": 1.0381235178271138,
"kl": 0.4748382568359375,
"learning_rate": 5e-07,
"loss": 0.0415,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 2055.359375,
"epoch": 0.05327102803738318,
"grad_norm": 1.1876220917504008,
"kl": 1.00286865234375,
"learning_rate": 5e-07,
"loss": 0.0177,
"reward": 0.37011719890870154,
"reward_std": 0.3370174712035805,
"rewards/end_of_conversation_reward_func": 0.07421875145519152,
"rewards/end_rm_reward_func": 0.2958984375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 57
},
{
"clip_ratio": 0.02977730438578874,
"epoch": 0.05420560747663551,
"grad_norm": 1.068202991310054,
"kl": 0.8372344970703125,
"learning_rate": 5e-07,
"loss": 0.0182,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 2018.375,
"epoch": 0.05514018691588785,
"grad_norm": 1.1523841294545756,
"kl": 1.866424560546875,
"learning_rate": 5e-07,
"loss": 0.0263,
"reward": 0.41113282507285476,
"reward_std": 0.3193635992356576,
"rewards/end_of_conversation_reward_func": 0.07031250093132257,
"rewards/end_rm_reward_func": 0.3408203125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 59
},
{
"clip_ratio": 0.02743027382530272,
"epoch": 0.056074766355140186,
"grad_norm": 1.045749009073609,
"kl": 0.7020263671875,
"learning_rate": 5e-07,
"loss": 0.0268,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 1794.4140625,
"epoch": 0.05700934579439252,
"grad_norm": 1.0912447227651245,
"kl": 0.14093017578125,
"learning_rate": 5e-07,
"loss": -0.0038,
"reward": 0.36425782297737896,
"reward_std": 0.32298252754844725,
"rewards/end_of_conversation_reward_func": 0.06250000034924597,
"rewards/end_rm_reward_func": 0.3017578125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 61
},
{
"clip_ratio": 0.02964514575432986,
"epoch": 0.05794392523364486,
"grad_norm": 1.018769038639198,
"kl": 0.180816650390625,
"learning_rate": 5e-07,
"loss": -0.0035,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 2075.7890625,
"epoch": 0.058878504672897194,
"grad_norm": 1.1420919911365868,
"kl": 2.07659912109375,
"learning_rate": 5e-07,
"loss": 0.0474,
"reward": 0.4462890678551048,
"reward_std": 0.3219987105112523,
"rewards/end_of_conversation_reward_func": 0.05078125046566129,
"rewards/end_rm_reward_func": 0.3955078125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 63
},
{
"clip_ratio": 0.026783736771903932,
"epoch": 0.059813084112149535,
"grad_norm": 1.0783334603422854,
"kl": 0.763427734375,
"learning_rate": 5e-07,
"loss": 0.0478,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 1749.0703125,
"epoch": 0.06074766355140187,
"grad_norm": 11.641215369666991,
"kl": 1.021759033203125,
"learning_rate": 5e-07,
"loss": -0.0107,
"reward": 0.38496094732545316,
"reward_std": 0.28824072727002203,
"rewards/end_of_conversation_reward_func": 0.06171875091968104,
"rewards/end_rm_reward_func": 0.3232421875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 65
},
{
"clip_ratio": 0.026883197599090636,
"epoch": 0.0616822429906542,
"grad_norm": 1.1308530803681143,
"kl": 0.78778076171875,
"learning_rate": 5e-07,
"loss": -0.0124,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 2340.5703125,
"epoch": 0.06261682242990654,
"grad_norm": 1.0325499609090654,
"kl": 0.6973876953125,
"learning_rate": 5e-07,
"loss": 0.0291,
"reward": 0.2919921954162419,
"reward_std": 0.372808160725981,
"rewards/end_of_conversation_reward_func": 0.0625000010477379,
"rewards/end_rm_reward_func": 0.2294921875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 67
},
{
"clip_ratio": 0.024969650781713426,
"epoch": 0.06355140186915888,
"grad_norm": 0.9501171001953348,
"kl": 0.953125,
"learning_rate": 5e-07,
"loss": 0.0295,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 1993.921875,
"epoch": 0.06448598130841121,
"grad_norm": 1.371250349290397,
"kl": 48.9132080078125,
"learning_rate": 5e-07,
"loss": 0.0288,
"reward": 0.33339844923466444,
"reward_std": 0.3135456896852702,
"rewards/end_of_conversation_reward_func": 0.07656250102445483,
"rewards/end_rm_reward_func": 0.2568359375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 69
},
{
"clip_ratio": 0.02892337238881737,
"epoch": 0.06542056074766354,
"grad_norm": 1.113720408056213,
"kl": 1089.4842834472656,
"learning_rate": 5e-07,
"loss": 0.0292,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 2149.3671875,
"epoch": 0.06635514018691589,
"grad_norm": 65.96446607868502,
"kl": 1.7677001953125,
"learning_rate": 5e-07,
"loss": 0.032,
"reward": 0.33554687892319635,
"reward_std": 0.2645741559099406,
"rewards/end_of_conversation_reward_func": 0.06406250077998266,
"rewards/end_rm_reward_func": 0.271484375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 71
},
{
"clip_ratio": 0.029769035056233406,
"epoch": 0.06728971962616823,
"grad_norm": 1.9238115101434357,
"kl": 0.82586669921875,
"learning_rate": 5e-07,
"loss": 0.0204,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 2251.5078125,
"epoch": 0.06822429906542056,
"grad_norm": 1.093553770674018,
"kl": 5.304595947265625,
"learning_rate": 5e-07,
"loss": 0.0139,
"reward": 0.3511718884110451,
"reward_std": 0.3261044346727431,
"rewards/end_of_conversation_reward_func": 0.06406250211875886,
"rewards/end_rm_reward_func": 0.287109375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 73
},
{
"clip_ratio": 0.029941060696728528,
"epoch": 0.0691588785046729,
"grad_norm": 0.9924767780689709,
"kl": 7.77581787109375,
"learning_rate": 5e-07,
"loss": 0.0147,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 2007.5625,
"epoch": 0.07009345794392523,
"grad_norm": 1.117231820935201,
"kl": 57.311798095703125,
"learning_rate": 5e-07,
"loss": 0.0187,
"reward": 0.38046876317821443,
"reward_std": 0.34462365321815014,
"rewards/end_of_conversation_reward_func": 0.07187500089639798,
"rewards/end_rm_reward_func": 0.30859375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 75
},
{
"clip_ratio": 0.02864129119552672,
"epoch": 0.07102803738317758,
"grad_norm": 1.0220409584650187,
"kl": 11.564544677734375,
"learning_rate": 5e-07,
"loss": 0.0191,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 1889.0625,
"epoch": 0.07196261682242991,
"grad_norm": 1.1712577567652858,
"kl": 0.666534423828125,
"learning_rate": 5e-07,
"loss": 0.0401,
"reward": 0.47792970621958375,
"reward_std": 0.28913656808435917,
"rewards/end_of_conversation_reward_func": 0.06484375178115442,
"rewards/end_rm_reward_func": 0.4130859375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 77
},
{
"clip_ratio": 0.030115694738924503,
"epoch": 0.07289719626168224,
"grad_norm": 1.0909225462290086,
"kl": 5.707000732421875,
"learning_rate": 5e-07,
"loss": 0.0407,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 1937.109375,
"epoch": 0.07383177570093458,
"grad_norm": 1.2803907621255146,
"kl": 21.563323974609375,
"learning_rate": 5e-07,
"loss": 0.0499,
"reward": 0.3015625071711838,
"reward_std": 0.26099140965379775,
"rewards/end_of_conversation_reward_func": 0.06328125082654878,
"rewards/end_rm_reward_func": 0.23828125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 79
},
{
"clip_ratio": 0.026158058433793485,
"epoch": 0.07476635514018691,
"grad_norm": 1.115605787060801,
"kl": 67.679931640625,
"learning_rate": 5e-07,
"loss": 0.0502,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 2278.6015625,
"epoch": 0.07570093457943926,
"grad_norm": 1.1138017988769089,
"kl": 4.114990234375,
"learning_rate": 5e-07,
"loss": 0.0362,
"reward": 0.30683594301808625,
"reward_std": 0.30768118891865015,
"rewards/end_of_conversation_reward_func": 0.05390625080326572,
"rewards/end_rm_reward_func": 0.2529296875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 81
},
{
"clip_ratio": 0.026793913450092077,
"epoch": 0.07663551401869159,
"grad_norm": 1.022351033383469,
"kl": 3.494293212890625,
"learning_rate": 5e-07,
"loss": 0.0367,
"step": 82
},
{
"clip_ratio": 0.0,
"completion_length": 1917.625,
"epoch": 0.07757009345794393,
"grad_norm": 1.2084387510452357,
"kl": 1553.0602111816406,
"learning_rate": 5e-07,
"loss": 0.0336,
"reward": 0.34628906892612576,
"reward_std": 0.3024712570477277,
"rewards/end_of_conversation_reward_func": 0.056250000605359674,
"rewards/end_rm_reward_func": 0.2900390625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 83
},
{
"clip_ratio": 0.031054493854753673,
"epoch": 0.07850467289719626,
"grad_norm": 1.0721582326117443,
"kl": 1984.7315063476562,
"learning_rate": 5e-07,
"loss": 0.034,
"step": 84
},
{
"clip_ratio": 0.0,
"completion_length": 1972.7109375,
"epoch": 0.0794392523364486,
"grad_norm": 1.2076398907858923,
"kl": 23.14263916015625,
"learning_rate": 5e-07,
"loss": 0.019,
"reward": 0.41777345445007086,
"reward_std": 0.3385109493974596,
"rewards/end_of_conversation_reward_func": 0.07890625204890966,
"rewards/end_rm_reward_func": 0.3388671875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 85
},
{
"clip_ratio": 0.027960697188973427,
"epoch": 0.08037383177570094,
"grad_norm": 1.0718881257851973,
"kl": 210.31060791015625,
"learning_rate": 5e-07,
"loss": 0.0195,
"step": 86
},
{
"clip_ratio": 0.0,
"completion_length": 2164.5546875,
"epoch": 0.08130841121495327,
"grad_norm": 53655.76536085255,
"kl": 251.2279052734375,
"learning_rate": 5e-07,
"loss": 9.8495,
"reward": 0.36191407358273864,
"reward_std": 0.3778584632091224,
"rewards/end_of_conversation_reward_func": 0.07968750153668225,
"rewards/end_rm_reward_func": 0.2861328125,
"rewards/length_reward_func": -0.00390625,
"rewards/thinking_reward_func": 0.0,
"step": 87
},
{
"clip_ratio": 0.02942436095327139,
"epoch": 0.08224299065420561,
"grad_norm": 3670.0837241224376,
"kl": 16.27874755859375,
"learning_rate": 5e-07,
"loss": 0.5318,
"step": 88
},
{
"clip_ratio": 0.0,
"completion_length": 2054.796875,
"epoch": 0.08317757009345794,
"grad_norm": 1.1784473544052718,
"kl": 80.361328125,
"learning_rate": 5e-07,
"loss": 0.0233,
"reward": 0.5316406297497451,
"reward_std": 0.34404546627774835,
"rewards/end_of_conversation_reward_func": 0.06093750084983185,
"rewards/end_rm_reward_func": 0.470703125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 89
},
{
"clip_ratio": 0.03067210176959634,
"epoch": 0.08411214953271028,
"grad_norm": 1.0999557908516517,
"kl": 232.91717529296875,
"learning_rate": 5e-07,
"loss": 0.0241,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 2160.5078125,
"epoch": 0.08504672897196262,
"grad_norm": 1.1685914093715908,
"kl": 13.7022705078125,
"learning_rate": 5e-07,
"loss": 0.015,
"reward": 0.39941407908918336,
"reward_std": 0.4160995290149003,
"rewards/end_of_conversation_reward_func": 0.07031250058207661,
"rewards/end_rm_reward_func": 0.3291015625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 91
},
{
"clip_ratio": 0.03050083527341485,
"epoch": 0.08598130841121496,
"grad_norm": 1.0383665896788337,
"kl": 6.238983154296875,
"learning_rate": 5e-07,
"loss": 0.0156,
"step": 92
},
{
"clip_ratio": 0.0,
"completion_length": 1954.2578125,
"epoch": 0.08691588785046729,
"grad_norm": 2.2687355754428173,
"kl": 65.58651733398438,
"learning_rate": 5e-07,
"loss": 0.0319,
"reward": 0.3478515758179128,
"reward_std": 0.3058149954304099,
"rewards/end_of_conversation_reward_func": 0.06953125057043508,
"rewards/end_rm_reward_func": 0.2783203125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 93
},
{
"clip_ratio": 0.028172997990623116,
"epoch": 0.08785046728971962,
"grad_norm": 1.6402257084650889,
"kl": 92.76519775390625,
"learning_rate": 5e-07,
"loss": 0.032,
"step": 94
},
{
"clip_ratio": 0.0,
"completion_length": 2004.828125,
"epoch": 0.08878504672897196,
"grad_norm": 1.1415388584450838,
"kl": 2546.2605895996094,
"learning_rate": 5e-07,
"loss": 0.0237,
"reward": 0.37500001094304025,
"reward_std": 0.28348651831038296,
"rewards/end_of_conversation_reward_func": 0.07812500034924597,
"rewards/end_rm_reward_func": 0.30078125,
"rewards/length_reward_func": -0.00390625,
"rewards/thinking_reward_func": 0.0,
"step": 95
},
{
"clip_ratio": 0.02549815981183201,
"epoch": 0.08971962616822429,
"grad_norm": 1.081004230814789,
"kl": 395.8102111816406,
"learning_rate": 5e-07,
"loss": 0.0243,
"step": 96
},
{
"clip_ratio": 0.0,
"completion_length": 1971.4375,
"epoch": 0.09065420560747664,
"grad_norm": 1.055902509166769,
"kl": 15.49505615234375,
"learning_rate": 5e-07,
"loss": 0.0335,
"reward": 0.40781250852160156,
"reward_std": 0.3356896792538464,
"rewards/end_of_conversation_reward_func": 0.06796875118743628,
"rewards/end_rm_reward_func": 0.33984375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 97
},
{
"clip_ratio": 0.02853394311387092,
"epoch": 0.09158878504672897,
"grad_norm": 1.023788947785838,
"kl": 13.60675048828125,
"learning_rate": 5e-07,
"loss": 0.034,
"step": 98
},
{
"clip_ratio": 0.0,
"completion_length": 2164.7109375,
"epoch": 0.09252336448598131,
"grad_norm": 1.5344813793778085,
"kl": 1824.4334716796875,
"learning_rate": 5e-07,
"loss": 0.0159,
"reward": 0.37500000663567334,
"reward_std": 0.32311960216611624,
"rewards/end_of_conversation_reward_func": 0.06640625116415322,
"rewards/end_rm_reward_func": 0.30859375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 99
},
{
"clip_ratio": 0.02664117852691561,
"epoch": 0.09345794392523364,
"grad_norm": 1.0889813227388263,
"kl": 1824.4682312011719,
"learning_rate": 5e-07,
"loss": 0.0168,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 2111.9609375,
"epoch": 0.09439252336448598,
"grad_norm": 0.9625091435045346,
"kl": 2.5318603515625,
"learning_rate": 5e-07,
"loss": 0.0209,
"reward": 0.33808594394940883,
"reward_std": 0.3546365643851459,
"rewards/end_of_conversation_reward_func": 0.04609375057043508,
"rewards/end_rm_reward_func": 0.2958984375,
"rewards/length_reward_func": -0.00390625,
"rewards/thinking_reward_func": 0.0,
"step": 101
},
{
"clip_ratio": 0.02783052495215088,
"epoch": 0.09532710280373832,
"grad_norm": 0.8974071545776331,
"kl": 10.89996337890625,
"learning_rate": 5e-07,
"loss": 0.0217,
"step": 102
},
{
"clip_ratio": 0.0,
"completion_length": 1706.078125,
"epoch": 0.09626168224299066,
"grad_norm": 1.0942238826222048,
"kl": 2.6583251953125,
"learning_rate": 5e-07,
"loss": 0.0271,
"reward": 0.42695313412696123,
"reward_std": 0.3482397049665451,
"rewards/end_of_conversation_reward_func": 0.07734375109430403,
"rewards/end_rm_reward_func": 0.349609375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 103
},
{
"clip_ratio": 0.02592139912303537,
"epoch": 0.09719626168224299,
"grad_norm": 0.9820704192096017,
"kl": 5.14837646484375,
"learning_rate": 5e-07,
"loss": 0.0277,
"step": 104
},
{
"clip_ratio": 0.0,
"completion_length": 1888.265625,
"epoch": 0.09813084112149532,
"grad_norm": 1.101428681790454,
"kl": 1.542755126953125,
"learning_rate": 5e-07,
"loss": 0.0239,
"reward": 0.26035156939178705,
"reward_std": 0.2557795748580247,
"rewards/end_of_conversation_reward_func": 0.052343750663567334,
"rewards/end_rm_reward_func": 0.2080078125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 105
},
{
"clip_ratio": 0.028359673800878227,
"epoch": 0.09906542056074766,
"grad_norm": 1.0325324907073261,
"kl": 1.659881591796875,
"learning_rate": 5e-07,
"loss": 0.0243,
"step": 106
},
{
"clip_ratio": 0.0,
"completion_length": 1896.203125,
"epoch": 0.1,
"grad_norm": 1.2162479513962308,
"kl": 2.1485595703125,
"learning_rate": 5e-07,
"loss": 0.0064,
"reward": 0.4125000135973096,
"reward_std": 0.38071509543806314,
"rewards/end_of_conversation_reward_func": 0.08046875102445483,
"rewards/end_rm_reward_func": 0.33203125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 107
},
{
"clip_ratio": 0.03139533591456711,
"epoch": 0.10093457943925234,
"grad_norm": 1.080434258639653,
"kl": 1.02691650390625,
"learning_rate": 5e-07,
"loss": 0.0069,
"step": 108
},
{
"clip_ratio": 0.0,
"completion_length": 1806.3125,
"epoch": 0.10186915887850467,
"grad_norm": 1.1593632179482682,
"kl": 2.096435546875,
"learning_rate": 5e-07,
"loss": 0.0132,
"reward": 0.3439453151077032,
"reward_std": 0.252871933626011,
"rewards/end_of_conversation_reward_func": 0.05781250086147338,
"rewards/end_rm_reward_func": 0.2861328125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 109
},
{
"clip_ratio": 0.024627559236250818,
"epoch": 0.102803738317757,
"grad_norm": 1.0771076027292037,
"kl": 3.48040771484375,
"learning_rate": 5e-07,
"loss": 0.0135,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1964.296875,
"epoch": 0.10373831775700934,
"grad_norm": 1.0171895687624364,
"kl": 240.58120727539062,
"learning_rate": 5e-07,
"loss": 0.0527,
"reward": 0.44511719804722816,
"reward_std": 0.3740708865225315,
"rewards/end_of_conversation_reward_func": 0.06718750117579475,
"rewards/end_rm_reward_func": 0.3779296875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 111
},
{
"clip_ratio": 0.028232302283868194,
"epoch": 0.10467289719626169,
"grad_norm": 0.9421146301422737,
"kl": 208.3177490234375,
"learning_rate": 5e-07,
"loss": 0.0534,
"step": 112
},
{
"clip_ratio": 0.0,
"completion_length": 2157.8359375,
"epoch": 0.10560747663551402,
"grad_norm": 1641.5392722026472,
"kl": 236.9107666015625,
"learning_rate": 5e-07,
"loss": 0.0971,
"reward": 0.33730469457805157,
"reward_std": 0.3461699963081628,
"rewards/end_of_conversation_reward_func": 0.06093750044237822,
"rewards/end_rm_reward_func": 0.2763671875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 113
},
{
"clip_ratio": 0.029742747312411666,
"epoch": 0.10654205607476636,
"grad_norm": 2650.3812295480748,
"kl": 11.709716796875,
"learning_rate": 5e-07,
"loss": 0.2283,
"step": 114
},
{
"clip_ratio": 0.0,
"completion_length": 2026.0546875,
"epoch": 0.10747663551401869,
"grad_norm": 1.1431858031367719,
"kl": 98.2745361328125,
"learning_rate": 5e-07,
"loss": 0.0166,
"reward": 0.381835951237008,
"reward_std": 0.3533117617480457,
"rewards/end_of_conversation_reward_func": 0.06250000040745363,
"rewards/end_rm_reward_func": 0.3193359375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 115
},
{
"clip_ratio": 0.029958419385366142,
"epoch": 0.10841121495327102,
"grad_norm": 1.029379609705644,
"kl": 42.82733154296875,
"learning_rate": 5e-07,
"loss": 0.0172,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 1906.421875,
"epoch": 0.10934579439252337,
"grad_norm": 1.3109647944857514,
"kl": 2.085693359375,
"learning_rate": 5e-07,
"loss": 0.0308,
"reward": 0.3277343953959644,
"reward_std": 0.3133402925450355,
"rewards/end_of_conversation_reward_func": 0.07578125037252903,
"rewards/end_rm_reward_func": 0.251953125,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 117
},
{
"clip_ratio": 0.028814686927944422,
"epoch": 0.1102803738317757,
"grad_norm": 1.1262854576591834,
"kl": 1.031341552734375,
"learning_rate": 5e-07,
"loss": 0.0312,
"step": 118
},
{
"clip_ratio": 0.0,
"completion_length": 2316.7578125,
"epoch": 0.11121495327102804,
"grad_norm": 1.0414340605792203,
"kl": 339.03094482421875,
"learning_rate": 5e-07,
"loss": 0.0345,
"reward": 0.3103515736875124,
"reward_std": 0.32673182454891503,
"rewards/end_of_conversation_reward_func": 0.06718750071013346,
"rewards/end_rm_reward_func": 0.2431640625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 119
},
{
"clip_ratio": 0.028005447005853057,
"epoch": 0.11214953271028037,
"grad_norm": 0.962355400088625,
"kl": 885.1737060546875,
"learning_rate": 5e-07,
"loss": 0.0352,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1997.1796875,
"epoch": 0.1130841121495327,
"grad_norm": 1.2316757316354439,
"kl": 15193.576782226562,
"learning_rate": 5e-07,
"loss": 0.0502,
"reward": 0.3468750179745257,
"reward_std": 0.32240671874023974,
"rewards/end_of_conversation_reward_func": 0.07343750074505806,
"rewards/end_rm_reward_func": 0.2734375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 121
},
{
"clip_ratio": 0.0309814119245857,
"epoch": 0.11401869158878504,
"grad_norm": 1.141627400565903,
"kl": 305263.65728759766,
"learning_rate": 5e-07,
"loss": 0.0508,
"step": 122
},
{
"clip_ratio": 0.0,
"completion_length": 1961.7734375,
"epoch": 0.11495327102803739,
"grad_norm": 1.2524833933172261,
"kl": 15.526611328125,
"learning_rate": 5e-07,
"loss": 0.0052,
"reward": 0.38671876676380634,
"reward_std": 0.3171714274212718,
"rewards/end_of_conversation_reward_func": 0.07812500069849193,
"rewards/end_rm_reward_func": 0.30859375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 123
},
{
"clip_ratio": 0.030462613562121987,
"epoch": 0.11588785046728972,
"grad_norm": 1.1092860205417427,
"kl": 258.77789306640625,
"learning_rate": 5e-07,
"loss": 0.0053,
"step": 124
},
{
"clip_ratio": 0.0,
"completion_length": 2030.90625,
"epoch": 0.11682242990654206,
"grad_norm": 1.5604290924244164,
"kl": 37.70721435546875,
"learning_rate": 5e-07,
"loss": 0.0311,
"reward": 0.48281251545995474,
"reward_std": 0.2945442160125822,
"rewards/end_of_conversation_reward_func": 0.07656250055879354,
"rewards/end_rm_reward_func": 0.40625,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 125
},
{
"clip_ratio": 0.03006787970662117,
"epoch": 0.11775700934579439,
"grad_norm": 1.7413118821455535,
"kl": 23.40301513671875,
"learning_rate": 5e-07,
"loss": 0.0318,
"step": 126
},
{
"clip_ratio": 0.0,
"completion_length": 2265.2421875,
"epoch": 0.11869158878504672,
"grad_norm": 1.212567517072936,
"kl": 42.730712890625,
"learning_rate": 5e-07,
"loss": 0.0065,
"reward": 0.4443359524011612,
"reward_std": 0.392240944551304,
"rewards/end_of_conversation_reward_func": 0.06250000098953024,
"rewards/end_rm_reward_func": 0.3818359375,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 127
},
{
"clip_ratio": 0.02885347604751587,
"epoch": 0.11962616822429907,
"grad_norm": 1.1199940504331805,
"kl": 13.65277099609375,
"learning_rate": 5e-07,
"loss": 0.0069,
"step": 128
},
{
"clip_ratio": 0.0,
"completion_length": 1686.7734375,
"epoch": 0.1205607476635514,
"grad_norm": 1.2282859501226184,
"kl": 2.62017822265625,
"learning_rate": 5e-07,
"loss": -0.0218,
"reward": 0.46074219583533704,
"reward_std": 0.32941993116401136,
"rewards/end_of_conversation_reward_func": 0.059375000826548785,
"rewards/end_rm_reward_func": 0.4013671875,
"rewards/length_reward_func": 0.0,
"rewards/thinking_reward_func": 0.0,
"step": 129
},
{
"clip_ratio": 0.029977424652315676,
"epoch": 0.12149532710280374,
"grad_norm": 1.201467543453189,
"kl": 4.4796142578125,
"learning_rate": 5e-07,
"loss": -0.0214,
"step": 130
}
],
"logging_steps": 1,
"max_steps": 1070,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}