| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.12149532710280374, |
| "eval_steps": 500, |
| "global_step": 130, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2213.6484375, |
| "epoch": 0.0009345794392523365, |
| "grad_norm": 0.9274865773093601, |
| "kl": 0.0, |
| "learning_rate": 5e-07, |
| "loss": 0.0131, |
| "reward": 0.3671875114669092, |
| "reward_std": 0.3501587579958141, |
| "rewards/end_of_conversation_reward_func": 0.05468750116415322, |
| "rewards/end_rm_reward_func": 0.3125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.025033144396729767, |
| "epoch": 0.001869158878504673, |
| "grad_norm": 0.8911286780735345, |
| "kl": 0.006732940673828125, |
| "learning_rate": 5e-07, |
| "loss": 0.0136, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1924.0390625, |
| "epoch": 0.002803738317757009, |
| "grad_norm": 1.0812826786849123, |
| "kl": 0.007961273193359375, |
| "learning_rate": 5e-07, |
| "loss": 0.0103, |
| "reward": 0.30449220282025635, |
| "reward_std": 0.3021779216360301, |
| "rewards/end_of_conversation_reward_func": 0.07890625158324838, |
| "rewards/end_rm_reward_func": 0.2255859375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.02741119544953108, |
| "epoch": 0.003738317757009346, |
| "grad_norm": 1.0223081613434297, |
| "kl": 0.007640838623046875, |
| "learning_rate": 5e-07, |
| "loss": 0.0106, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2317.28125, |
| "epoch": 0.004672897196261682, |
| "grad_norm": 0.9173763955261445, |
| "kl": 0.006732940673828125, |
| "learning_rate": 5e-07, |
| "loss": 0.0504, |
| "reward": 0.23222656873986125, |
| "reward_std": 0.2611159069929272, |
| "rewards/end_of_conversation_reward_func": 0.0632812503608875, |
| "rewards/end_rm_reward_func": 0.1689453125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.025625309790484607, |
| "epoch": 0.005607476635514018, |
| "grad_norm": 0.9085778304220192, |
| "kl": 0.0066356658935546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0505, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1964.3046875, |
| "epoch": 0.0065420560747663555, |
| "grad_norm": 1.0789728731378576, |
| "kl": 0.00862884521484375, |
| "learning_rate": 5e-07, |
| "loss": 0.0348, |
| "reward": 0.39062501583248377, |
| "reward_std": 0.3314252281561494, |
| "rewards/end_of_conversation_reward_func": 0.0703125016298145, |
| "rewards/end_rm_reward_func": 0.3203125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.027227300102822483, |
| "epoch": 0.007476635514018692, |
| "grad_norm": 1.012376163737541, |
| "kl": 0.009044647216796875, |
| "learning_rate": 5e-07, |
| "loss": 0.0351, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2030.8828125, |
| "epoch": 0.008411214953271028, |
| "grad_norm": 0.9821357312327096, |
| "kl": 0.00811767578125, |
| "learning_rate": 5e-07, |
| "loss": 0.0024, |
| "reward": 0.4171875137835741, |
| "reward_std": 0.31918896292336285, |
| "rewards/end_of_conversation_reward_func": 0.06171875057043508, |
| "rewards/end_rm_reward_func": 0.35546875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.02804350107908249, |
| "epoch": 0.009345794392523364, |
| "grad_norm": 0.9187020172705397, |
| "kl": 0.01328277587890625, |
| "learning_rate": 5e-07, |
| "loss": 0.0028, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2098.609375, |
| "epoch": 0.010280373831775701, |
| "grad_norm": 1.0389274688932995, |
| "kl": 0.013088226318359375, |
| "learning_rate": 5e-07, |
| "loss": 0.0005, |
| "reward": 0.38964845007285476, |
| "reward_std": 0.3262035925872624, |
| "rewards/end_of_conversation_reward_func": 0.07031250116415322, |
| "rewards/end_rm_reward_func": 0.3193359375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.02878896868787706, |
| "epoch": 0.011214953271028037, |
| "grad_norm": 1.005444013858441, |
| "kl": 0.015062332153320312, |
| "learning_rate": 5e-07, |
| "loss": 0.0011, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1921.2734375, |
| "epoch": 0.012149532710280374, |
| "grad_norm": 1.0317328787031124, |
| "kl": 0.013071060180664062, |
| "learning_rate": 5e-07, |
| "loss": 0.0341, |
| "reward": 0.3585937600582838, |
| "reward_std": 0.3541586115024984, |
| "rewards/end_of_conversation_reward_func": 0.06171875016298145, |
| "rewards/end_rm_reward_func": 0.296875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.029582467046566308, |
| "epoch": 0.013084112149532711, |
| "grad_norm": 0.9635840549431879, |
| "kl": 0.01287078857421875, |
| "learning_rate": 5e-07, |
| "loss": 0.0345, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2046.6484375, |
| "epoch": 0.014018691588785047, |
| "grad_norm": 1.0353766722235376, |
| "kl": 0.012033462524414062, |
| "learning_rate": 5e-07, |
| "loss": 0.0376, |
| "reward": 0.31542970187729225, |
| "reward_std": 0.308479615021497, |
| "rewards/end_of_conversation_reward_func": 0.054687501513399184, |
| "rewards/end_rm_reward_func": 0.2607421875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.028461063280701637, |
| "epoch": 0.014953271028037384, |
| "grad_norm": 0.9802840358814225, |
| "kl": 0.013523101806640625, |
| "learning_rate": 5e-07, |
| "loss": 0.0384, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1996.390625, |
| "epoch": 0.01588785046728972, |
| "grad_norm": 1.0375805713420467, |
| "kl": 0.014972686767578125, |
| "learning_rate": 5e-07, |
| "loss": 0.0257, |
| "reward": 0.3818359471624717, |
| "reward_std": 0.2914374100510031, |
| "rewards/end_of_conversation_reward_func": 0.058593750873114914, |
| "rewards/end_rm_reward_func": 0.3232421875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.027674222481437027, |
| "epoch": 0.016822429906542057, |
| "grad_norm": 1.0160437873794967, |
| "kl": 0.02030181884765625, |
| "learning_rate": 5e-07, |
| "loss": 0.0264, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1832.9375, |
| "epoch": 0.017757009345794394, |
| "grad_norm": 1.0312841889104536, |
| "kl": 0.033977508544921875, |
| "learning_rate": 5e-07, |
| "loss": 0.0194, |
| "reward": 0.40410157246515155, |
| "reward_std": 0.3510888592572883, |
| "rewards/end_of_conversation_reward_func": 0.059375000768341124, |
| "rewards/end_rm_reward_func": 0.3447265625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.027633688994683325, |
| "epoch": 0.018691588785046728, |
| "grad_norm": 0.9678211628152914, |
| "kl": 7.021579742431641, |
| "learning_rate": 5e-07, |
| "loss": 0.0196, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2241.8359375, |
| "epoch": 0.019626168224299065, |
| "grad_norm": 1.0128829331154197, |
| "kl": 0.0332489013671875, |
| "learning_rate": 5e-07, |
| "loss": 0.0264, |
| "reward": 0.2441406348370947, |
| "reward_std": 0.2879671745467931, |
| "rewards/end_of_conversation_reward_func": 0.07421875081490725, |
| "rewards/end_rm_reward_func": 0.169921875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.02383307204581797, |
| "epoch": 0.020560747663551402, |
| "grad_norm": 0.9296707720991904, |
| "kl": 0.040599822998046875, |
| "learning_rate": 5e-07, |
| "loss": 0.0268, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2584.5, |
| "epoch": 0.02149532710280374, |
| "grad_norm": 0.8926960463526834, |
| "kl": 0.7261238098144531, |
| "learning_rate": 5e-07, |
| "loss": 0.0246, |
| "reward": 0.3001953187631443, |
| "reward_std": 0.31693244143389165, |
| "rewards/end_of_conversation_reward_func": 0.05703125102445483, |
| "rewards/end_rm_reward_func": 0.2431640625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.025873058126308024, |
| "epoch": 0.022429906542056073, |
| "grad_norm": 0.8345013055872595, |
| "kl": 0.16021347045898438, |
| "learning_rate": 5e-07, |
| "loss": 0.025, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1748.03125, |
| "epoch": 0.02336448598130841, |
| "grad_norm": 1.1173548960610717, |
| "kl": 0.04956817626953125, |
| "learning_rate": 5e-07, |
| "loss": -0.0013, |
| "reward": 0.5044922037050128, |
| "reward_std": 0.407341014361009, |
| "rewards/end_of_conversation_reward_func": 0.06796875083819032, |
| "rewards/end_rm_reward_func": 0.4365234375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.029045677627436817, |
| "epoch": 0.024299065420560748, |
| "grad_norm": 1.0584679999454167, |
| "kl": 0.05883026123046875, |
| "learning_rate": 5e-07, |
| "loss": -0.0007, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1884.03125, |
| "epoch": 0.025233644859813085, |
| "grad_norm": 1.0128102944540516, |
| "kl": 0.0649871826171875, |
| "learning_rate": 5e-07, |
| "loss": 0.0393, |
| "reward": 0.4763671956025064, |
| "reward_std": 0.3862752839922905, |
| "rewards/end_of_conversation_reward_func": 0.059375000768341124, |
| "rewards/end_rm_reward_func": 0.4169921875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.028907910105772316, |
| "epoch": 0.026168224299065422, |
| "grad_norm": 0.9150592324011979, |
| "kl": 0.1536865234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0398, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1675.265625, |
| "epoch": 0.027102803738317756, |
| "grad_norm": 1.184705074990119, |
| "kl": 0.241058349609375, |
| "learning_rate": 5e-07, |
| "loss": 0.0311, |
| "reward": 0.3828125111758709, |
| "reward_std": 0.31181320175528526, |
| "rewards/end_of_conversation_reward_func": 0.05859375064028427, |
| "rewards/end_rm_reward_func": 0.32421875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.02990245760884136, |
| "epoch": 0.028037383177570093, |
| "grad_norm": 1.1428586426066105, |
| "kl": 3.1220054626464844, |
| "learning_rate": 5e-07, |
| "loss": 0.0316, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2076.8046875, |
| "epoch": 0.02897196261682243, |
| "grad_norm": 1.1023210621247226, |
| "kl": 0.1524505615234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0032, |
| "reward": 0.3175781366880983, |
| "reward_std": 0.3118708392139524, |
| "rewards/end_of_conversation_reward_func": 0.06562500062864274, |
| "rewards/end_rm_reward_func": 0.251953125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.028195369872264564, |
| "epoch": 0.029906542056074768, |
| "grad_norm": 1.065397234398234, |
| "kl": 0.14312744140625, |
| "learning_rate": 5e-07, |
| "loss": 0.0037, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1835.15625, |
| "epoch": 0.0308411214953271, |
| "grad_norm": 1.0883386533644912, |
| "kl": 0.1009674072265625, |
| "learning_rate": 5e-07, |
| "loss": 0.006, |
| "reward": 0.47109376499429345, |
| "reward_std": 0.4281590711325407, |
| "rewards/end_of_conversation_reward_func": 0.06875000108266249, |
| "rewards/end_rm_reward_func": 0.40234375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.030578713049180806, |
| "epoch": 0.03177570093457944, |
| "grad_norm": 1.0103915525345888, |
| "kl": 0.08032989501953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0067, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2264.5703125, |
| "epoch": 0.03271028037383177, |
| "grad_norm": 1.0203057509066855, |
| "kl": 2.56329345703125, |
| "learning_rate": 5e-07, |
| "loss": -0.0068, |
| "reward": 0.314453131519258, |
| "reward_std": 0.2885858778608963, |
| "rewards/end_of_conversation_reward_func": 0.06640625017462298, |
| "rewards/end_rm_reward_func": 0.248046875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.028288635658100247, |
| "epoch": 0.03364485981308411, |
| "grad_norm": 0.9275253113815312, |
| "kl": 2.92437744140625, |
| "learning_rate": 5e-07, |
| "loss": -0.0062, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2376.609375, |
| "epoch": 0.03457943925233645, |
| "grad_norm": 0.9618697576631778, |
| "kl": 0.0479736328125, |
| "learning_rate": 5e-07, |
| "loss": -0.0045, |
| "reward": 0.28437500644940883, |
| "reward_std": 0.3411878102924675, |
| "rewards/end_of_conversation_reward_func": 0.053906251036096364, |
| "rewards/end_rm_reward_func": 0.23046875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.024988370947539806, |
| "epoch": 0.03551401869158879, |
| "grad_norm": 0.8895332217423063, |
| "kl": 0.04958343505859375, |
| "learning_rate": 5e-07, |
| "loss": -0.0041, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1902.1640625, |
| "epoch": 0.03644859813084112, |
| "grad_norm": 1.107891872929539, |
| "kl": 11.097213745117188, |
| "learning_rate": 5e-07, |
| "loss": 0.0181, |
| "reward": 0.3992187549592927, |
| "reward_std": 0.31114612985402346, |
| "rewards/end_of_conversation_reward_func": 0.07109375146683306, |
| "rewards/end_rm_reward_func": 0.328125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.02839432912878692, |
| "epoch": 0.037383177570093455, |
| "grad_norm": 1.0581025263351234, |
| "kl": 1.0611953735351562, |
| "learning_rate": 5e-07, |
| "loss": 0.0188, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1947.9765625, |
| "epoch": 0.038317757009345796, |
| "grad_norm": 1.044504336049714, |
| "kl": 2384.179656982422, |
| "learning_rate": 5e-07, |
| "loss": 0.0119, |
| "reward": 0.349609378259629, |
| "reward_std": 0.365347285522148, |
| "rewards/end_of_conversation_reward_func": 0.07031250110594556, |
| "rewards/end_rm_reward_func": 0.279296875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0260704318061471, |
| "epoch": 0.03925233644859813, |
| "grad_norm": 0.9994876043774379, |
| "kl": 992.1758270263672, |
| "learning_rate": 5e-07, |
| "loss": 0.0123, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1981.40625, |
| "epoch": 0.04018691588785047, |
| "grad_norm": 1.13597665166102, |
| "kl": 4.000732421875, |
| "learning_rate": 5e-07, |
| "loss": 0.0065, |
| "reward": 0.28535156534053385, |
| "reward_std": 0.2903470410965383, |
| "rewards/end_of_conversation_reward_func": 0.05000000126892701, |
| "rewards/end_rm_reward_func": 0.2353515625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.028302057995460927, |
| "epoch": 0.041121495327102804, |
| "grad_norm": 0.9919056528612524, |
| "kl": 0.5160751342773438, |
| "learning_rate": 5e-07, |
| "loss": 0.0069, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2229.96875, |
| "epoch": 0.04205607476635514, |
| "grad_norm": 1.1389480742032796, |
| "kl": 0.0689239501953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0238, |
| "reward": 0.4009765740483999, |
| "reward_std": 0.32002427359111607, |
| "rewards/end_of_conversation_reward_func": 0.0601562510128133, |
| "rewards/end_rm_reward_func": 0.3408203125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.028976862784475088, |
| "epoch": 0.04299065420560748, |
| "grad_norm": 1.0789195240827836, |
| "kl": 0.471649169921875, |
| "learning_rate": 5e-07, |
| "loss": 0.0244, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2015.75, |
| "epoch": 0.04392523364485981, |
| "grad_norm": 1.0614224174398665, |
| "kl": 290.30157470703125, |
| "learning_rate": 5e-07, |
| "loss": 0.0326, |
| "reward": 0.43984375934815034, |
| "reward_std": 0.35610848292708397, |
| "rewards/end_of_conversation_reward_func": 0.06875000090803951, |
| "rewards/end_rm_reward_func": 0.37109375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.029352403013035655, |
| "epoch": 0.044859813084112146, |
| "grad_norm": 1.011898536861839, |
| "kl": 478.43064880371094, |
| "learning_rate": 5e-07, |
| "loss": 0.0332, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2297.375, |
| "epoch": 0.04579439252336449, |
| "grad_norm": 1.0458046562803893, |
| "kl": 0.3959197998046875, |
| "learning_rate": 5e-07, |
| "loss": 0.0264, |
| "reward": 0.40878907358273864, |
| "reward_std": 0.41261982172727585, |
| "rewards/end_of_conversation_reward_func": 0.06015625095460564, |
| "rewards/end_rm_reward_func": 0.3486328125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.02658259856980294, |
| "epoch": 0.04672897196261682, |
| "grad_norm": 1.0013387198061083, |
| "kl": 0.1452178955078125, |
| "learning_rate": 5e-07, |
| "loss": 0.027, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1941.359375, |
| "epoch": 0.04766355140186916, |
| "grad_norm": 1.043020590108944, |
| "kl": 0.4831390380859375, |
| "learning_rate": 5e-07, |
| "loss": 0.0261, |
| "reward": 0.36464845400769264, |
| "reward_std": 0.2816849281080067, |
| "rewards/end_of_conversation_reward_func": 0.06875000079162419, |
| "rewards/end_rm_reward_func": 0.2958984375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.026326473569497466, |
| "epoch": 0.048598130841121495, |
| "grad_norm": 0.9484135414702769, |
| "kl": 0.2133331298828125, |
| "learning_rate": 5e-07, |
| "loss": 0.0265, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2065.890625, |
| "epoch": 0.04953271028037383, |
| "grad_norm": 1.0419315292071203, |
| "kl": 0.9266204833984375, |
| "learning_rate": 5e-07, |
| "loss": 0.0207, |
| "reward": 0.4900390843395144, |
| "reward_std": 0.30717412871308625, |
| "rewards/end_of_conversation_reward_func": 0.07109375123400241, |
| "rewards/end_rm_reward_func": 0.4189453125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0251879645511508, |
| "epoch": 0.05046728971962617, |
| "grad_norm": 0.9533110472132086, |
| "kl": 0.435760498046875, |
| "learning_rate": 5e-07, |
| "loss": 0.0213, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1789.953125, |
| "epoch": 0.0514018691588785, |
| "grad_norm": 1.1217425421263005, |
| "kl": 0.214447021484375, |
| "learning_rate": 5e-07, |
| "loss": 0.0412, |
| "reward": 0.4402343900874257, |
| "reward_std": 0.3780560018494725, |
| "rewards/end_of_conversation_reward_func": 0.07890625135041773, |
| "rewards/end_rm_reward_func": 0.361328125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.030622108606621623, |
| "epoch": 0.052336448598130844, |
| "grad_norm": 1.0381235178271138, |
| "kl": 0.4748382568359375, |
| "learning_rate": 5e-07, |
| "loss": 0.0415, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2055.359375, |
| "epoch": 0.05327102803738318, |
| "grad_norm": 1.1876220917504008, |
| "kl": 1.00286865234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0177, |
| "reward": 0.37011719890870154, |
| "reward_std": 0.3370174712035805, |
| "rewards/end_of_conversation_reward_func": 0.07421875145519152, |
| "rewards/end_rm_reward_func": 0.2958984375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.02977730438578874, |
| "epoch": 0.05420560747663551, |
| "grad_norm": 1.068202991310054, |
| "kl": 0.8372344970703125, |
| "learning_rate": 5e-07, |
| "loss": 0.0182, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2018.375, |
| "epoch": 0.05514018691588785, |
| "grad_norm": 1.1523841294545756, |
| "kl": 1.866424560546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0263, |
| "reward": 0.41113282507285476, |
| "reward_std": 0.3193635992356576, |
| "rewards/end_of_conversation_reward_func": 0.07031250093132257, |
| "rewards/end_rm_reward_func": 0.3408203125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.02743027382530272, |
| "epoch": 0.056074766355140186, |
| "grad_norm": 1.045749009073609, |
| "kl": 0.7020263671875, |
| "learning_rate": 5e-07, |
| "loss": 0.0268, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1794.4140625, |
| "epoch": 0.05700934579439252, |
| "grad_norm": 1.0912447227651245, |
| "kl": 0.14093017578125, |
| "learning_rate": 5e-07, |
| "loss": -0.0038, |
| "reward": 0.36425782297737896, |
| "reward_std": 0.32298252754844725, |
| "rewards/end_of_conversation_reward_func": 0.06250000034924597, |
| "rewards/end_rm_reward_func": 0.3017578125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.02964514575432986, |
| "epoch": 0.05794392523364486, |
| "grad_norm": 1.018769038639198, |
| "kl": 0.180816650390625, |
| "learning_rate": 5e-07, |
| "loss": -0.0035, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2075.7890625, |
| "epoch": 0.058878504672897194, |
| "grad_norm": 1.1420919911365868, |
| "kl": 2.07659912109375, |
| "learning_rate": 5e-07, |
| "loss": 0.0474, |
| "reward": 0.4462890678551048, |
| "reward_std": 0.3219987105112523, |
| "rewards/end_of_conversation_reward_func": 0.05078125046566129, |
| "rewards/end_rm_reward_func": 0.3955078125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.026783736771903932, |
| "epoch": 0.059813084112149535, |
| "grad_norm": 1.0783334603422854, |
| "kl": 0.763427734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0478, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1749.0703125, |
| "epoch": 0.06074766355140187, |
| "grad_norm": 11.641215369666991, |
| "kl": 1.021759033203125, |
| "learning_rate": 5e-07, |
| "loss": -0.0107, |
| "reward": 0.38496094732545316, |
| "reward_std": 0.28824072727002203, |
| "rewards/end_of_conversation_reward_func": 0.06171875091968104, |
| "rewards/end_rm_reward_func": 0.3232421875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.026883197599090636, |
| "epoch": 0.0616822429906542, |
| "grad_norm": 1.1308530803681143, |
| "kl": 0.78778076171875, |
| "learning_rate": 5e-07, |
| "loss": -0.0124, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2340.5703125, |
| "epoch": 0.06261682242990654, |
| "grad_norm": 1.0325499609090654, |
| "kl": 0.6973876953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0291, |
| "reward": 0.2919921954162419, |
| "reward_std": 0.372808160725981, |
| "rewards/end_of_conversation_reward_func": 0.0625000010477379, |
| "rewards/end_rm_reward_func": 0.2294921875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.024969650781713426, |
| "epoch": 0.06355140186915888, |
| "grad_norm": 0.9501171001953348, |
| "kl": 0.953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0295, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1993.921875, |
| "epoch": 0.06448598130841121, |
| "grad_norm": 1.371250349290397, |
| "kl": 48.9132080078125, |
| "learning_rate": 5e-07, |
| "loss": 0.0288, |
| "reward": 0.33339844923466444, |
| "reward_std": 0.3135456896852702, |
| "rewards/end_of_conversation_reward_func": 0.07656250102445483, |
| "rewards/end_rm_reward_func": 0.2568359375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.02892337238881737, |
| "epoch": 0.06542056074766354, |
| "grad_norm": 1.113720408056213, |
| "kl": 1089.4842834472656, |
| "learning_rate": 5e-07, |
| "loss": 0.0292, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2149.3671875, |
| "epoch": 0.06635514018691589, |
| "grad_norm": 65.96446607868502, |
| "kl": 1.7677001953125, |
| "learning_rate": 5e-07, |
| "loss": 0.032, |
| "reward": 0.33554687892319635, |
| "reward_std": 0.2645741559099406, |
| "rewards/end_of_conversation_reward_func": 0.06406250077998266, |
| "rewards/end_rm_reward_func": 0.271484375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.029769035056233406, |
| "epoch": 0.06728971962616823, |
| "grad_norm": 1.9238115101434357, |
| "kl": 0.82586669921875, |
| "learning_rate": 5e-07, |
| "loss": 0.0204, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2251.5078125, |
| "epoch": 0.06822429906542056, |
| "grad_norm": 1.093553770674018, |
| "kl": 5.304595947265625, |
| "learning_rate": 5e-07, |
| "loss": 0.0139, |
| "reward": 0.3511718884110451, |
| "reward_std": 0.3261044346727431, |
| "rewards/end_of_conversation_reward_func": 0.06406250211875886, |
| "rewards/end_rm_reward_func": 0.287109375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.029941060696728528, |
| "epoch": 0.0691588785046729, |
| "grad_norm": 0.9924767780689709, |
| "kl": 7.77581787109375, |
| "learning_rate": 5e-07, |
| "loss": 0.0147, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2007.5625, |
| "epoch": 0.07009345794392523, |
| "grad_norm": 1.117231820935201, |
| "kl": 57.311798095703125, |
| "learning_rate": 5e-07, |
| "loss": 0.0187, |
| "reward": 0.38046876317821443, |
| "reward_std": 0.34462365321815014, |
| "rewards/end_of_conversation_reward_func": 0.07187500089639798, |
| "rewards/end_rm_reward_func": 0.30859375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.02864129119552672, |
| "epoch": 0.07102803738317758, |
| "grad_norm": 1.0220409584650187, |
| "kl": 11.564544677734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0191, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1889.0625, |
| "epoch": 0.07196261682242991, |
| "grad_norm": 1.1712577567652858, |
| "kl": 0.666534423828125, |
| "learning_rate": 5e-07, |
| "loss": 0.0401, |
| "reward": 0.47792970621958375, |
| "reward_std": 0.28913656808435917, |
| "rewards/end_of_conversation_reward_func": 0.06484375178115442, |
| "rewards/end_rm_reward_func": 0.4130859375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.030115694738924503, |
| "epoch": 0.07289719626168224, |
| "grad_norm": 1.0909225462290086, |
| "kl": 5.707000732421875, |
| "learning_rate": 5e-07, |
| "loss": 0.0407, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1937.109375, |
| "epoch": 0.07383177570093458, |
| "grad_norm": 1.2803907621255146, |
| "kl": 21.563323974609375, |
| "learning_rate": 5e-07, |
| "loss": 0.0499, |
| "reward": 0.3015625071711838, |
| "reward_std": 0.26099140965379775, |
| "rewards/end_of_conversation_reward_func": 0.06328125082654878, |
| "rewards/end_rm_reward_func": 0.23828125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.026158058433793485, |
| "epoch": 0.07476635514018691, |
| "grad_norm": 1.115605787060801, |
| "kl": 67.679931640625, |
| "learning_rate": 5e-07, |
| "loss": 0.0502, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2278.6015625, |
| "epoch": 0.07570093457943926, |
| "grad_norm": 1.1138017988769089, |
| "kl": 4.114990234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0362, |
| "reward": 0.30683594301808625, |
| "reward_std": 0.30768118891865015, |
| "rewards/end_of_conversation_reward_func": 0.05390625080326572, |
| "rewards/end_rm_reward_func": 0.2529296875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 81 |
| }, |
| { |
| "clip_ratio": 0.026793913450092077, |
| "epoch": 0.07663551401869159, |
| "grad_norm": 1.022351033383469, |
| "kl": 3.494293212890625, |
| "learning_rate": 5e-07, |
| "loss": 0.0367, |
| "step": 82 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1917.625, |
| "epoch": 0.07757009345794393, |
| "grad_norm": 1.2084387510452357, |
| "kl": 1553.0602111816406, |
| "learning_rate": 5e-07, |
| "loss": 0.0336, |
| "reward": 0.34628906892612576, |
| "reward_std": 0.3024712570477277, |
| "rewards/end_of_conversation_reward_func": 0.056250000605359674, |
| "rewards/end_rm_reward_func": 0.2900390625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 83 |
| }, |
| { |
| "clip_ratio": 0.031054493854753673, |
| "epoch": 0.07850467289719626, |
| "grad_norm": 1.0721582326117443, |
| "kl": 1984.7315063476562, |
| "learning_rate": 5e-07, |
| "loss": 0.034, |
| "step": 84 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1972.7109375, |
| "epoch": 0.0794392523364486, |
| "grad_norm": 1.2076398907858923, |
| "kl": 23.14263916015625, |
| "learning_rate": 5e-07, |
| "loss": 0.019, |
| "reward": 0.41777345445007086, |
| "reward_std": 0.3385109493974596, |
| "rewards/end_of_conversation_reward_func": 0.07890625204890966, |
| "rewards/end_rm_reward_func": 0.3388671875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.027960697188973427, |
| "epoch": 0.08037383177570094, |
| "grad_norm": 1.0718881257851973, |
| "kl": 210.31060791015625, |
| "learning_rate": 5e-07, |
| "loss": 0.0195, |
| "step": 86 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2164.5546875, |
| "epoch": 0.08130841121495327, |
| "grad_norm": 53655.76536085255, |
| "kl": 251.2279052734375, |
| "learning_rate": 5e-07, |
| "loss": 9.8495, |
| "reward": 0.36191407358273864, |
| "reward_std": 0.3778584632091224, |
| "rewards/end_of_conversation_reward_func": 0.07968750153668225, |
| "rewards/end_rm_reward_func": 0.2861328125, |
| "rewards/length_reward_func": -0.00390625, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 87 |
| }, |
| { |
| "clip_ratio": 0.02942436095327139, |
| "epoch": 0.08224299065420561, |
| "grad_norm": 3670.0837241224376, |
| "kl": 16.27874755859375, |
| "learning_rate": 5e-07, |
| "loss": 0.5318, |
| "step": 88 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2054.796875, |
| "epoch": 0.08317757009345794, |
| "grad_norm": 1.1784473544052718, |
| "kl": 80.361328125, |
| "learning_rate": 5e-07, |
| "loss": 0.0233, |
| "reward": 0.5316406297497451, |
| "reward_std": 0.34404546627774835, |
| "rewards/end_of_conversation_reward_func": 0.06093750084983185, |
| "rewards/end_rm_reward_func": 0.470703125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 89 |
| }, |
| { |
| "clip_ratio": 0.03067210176959634, |
| "epoch": 0.08411214953271028, |
| "grad_norm": 1.0999557908516517, |
| "kl": 232.91717529296875, |
| "learning_rate": 5e-07, |
| "loss": 0.0241, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2160.5078125, |
| "epoch": 0.08504672897196262, |
| "grad_norm": 1.1685914093715908, |
| "kl": 13.7022705078125, |
| "learning_rate": 5e-07, |
| "loss": 0.015, |
| "reward": 0.39941407908918336, |
| "reward_std": 0.4160995290149003, |
| "rewards/end_of_conversation_reward_func": 0.07031250058207661, |
| "rewards/end_rm_reward_func": 0.3291015625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 91 |
| }, |
| { |
| "clip_ratio": 0.03050083527341485, |
| "epoch": 0.08598130841121496, |
| "grad_norm": 1.0383665896788337, |
| "kl": 6.238983154296875, |
| "learning_rate": 5e-07, |
| "loss": 0.0156, |
| "step": 92 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1954.2578125, |
| "epoch": 0.08691588785046729, |
| "grad_norm": 2.2687355754428173, |
| "kl": 65.58651733398438, |
| "learning_rate": 5e-07, |
| "loss": 0.0319, |
| "reward": 0.3478515758179128, |
| "reward_std": 0.3058149954304099, |
| "rewards/end_of_conversation_reward_func": 0.06953125057043508, |
| "rewards/end_rm_reward_func": 0.2783203125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 93 |
| }, |
| { |
| "clip_ratio": 0.028172997990623116, |
| "epoch": 0.08785046728971962, |
| "grad_norm": 1.6402257084650889, |
| "kl": 92.76519775390625, |
| "learning_rate": 5e-07, |
| "loss": 0.032, |
| "step": 94 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2004.828125, |
| "epoch": 0.08878504672897196, |
| "grad_norm": 1.1415388584450838, |
| "kl": 2546.2605895996094, |
| "learning_rate": 5e-07, |
| "loss": 0.0237, |
| "reward": 0.37500001094304025, |
| "reward_std": 0.28348651831038296, |
| "rewards/end_of_conversation_reward_func": 0.07812500034924597, |
| "rewards/end_rm_reward_func": 0.30078125, |
| "rewards/length_reward_func": -0.00390625, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.02549815981183201, |
| "epoch": 0.08971962616822429, |
| "grad_norm": 1.081004230814789, |
| "kl": 395.8102111816406, |
| "learning_rate": 5e-07, |
| "loss": 0.0243, |
| "step": 96 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1971.4375, |
| "epoch": 0.09065420560747664, |
| "grad_norm": 1.055902509166769, |
| "kl": 15.49505615234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0335, |
| "reward": 0.40781250852160156, |
| "reward_std": 0.3356896792538464, |
| "rewards/end_of_conversation_reward_func": 0.06796875118743628, |
| "rewards/end_rm_reward_func": 0.33984375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 97 |
| }, |
| { |
| "clip_ratio": 0.02853394311387092, |
| "epoch": 0.09158878504672897, |
| "grad_norm": 1.023788947785838, |
| "kl": 13.60675048828125, |
| "learning_rate": 5e-07, |
| "loss": 0.034, |
| "step": 98 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2164.7109375, |
| "epoch": 0.09252336448598131, |
| "grad_norm": 1.5344813793778085, |
| "kl": 1824.4334716796875, |
| "learning_rate": 5e-07, |
| "loss": 0.0159, |
| "reward": 0.37500000663567334, |
| "reward_std": 0.32311960216611624, |
| "rewards/end_of_conversation_reward_func": 0.06640625116415322, |
| "rewards/end_rm_reward_func": 0.30859375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 99 |
| }, |
| { |
| "clip_ratio": 0.02664117852691561, |
| "epoch": 0.09345794392523364, |
| "grad_norm": 1.0889813227388263, |
| "kl": 1824.4682312011719, |
| "learning_rate": 5e-07, |
| "loss": 0.0168, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2111.9609375, |
| "epoch": 0.09439252336448598, |
| "grad_norm": 0.9625091435045346, |
| "kl": 2.5318603515625, |
| "learning_rate": 5e-07, |
| "loss": 0.0209, |
| "reward": 0.33808594394940883, |
| "reward_std": 0.3546365643851459, |
| "rewards/end_of_conversation_reward_func": 0.04609375057043508, |
| "rewards/end_rm_reward_func": 0.2958984375, |
| "rewards/length_reward_func": -0.00390625, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 101 |
| }, |
| { |
| "clip_ratio": 0.02783052495215088, |
| "epoch": 0.09532710280373832, |
| "grad_norm": 0.8974071545776331, |
| "kl": 10.89996337890625, |
| "learning_rate": 5e-07, |
| "loss": 0.0217, |
| "step": 102 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1706.078125, |
| "epoch": 0.09626168224299066, |
| "grad_norm": 1.0942238826222048, |
| "kl": 2.6583251953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0271, |
| "reward": 0.42695313412696123, |
| "reward_std": 0.3482397049665451, |
| "rewards/end_of_conversation_reward_func": 0.07734375109430403, |
| "rewards/end_rm_reward_func": 0.349609375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 103 |
| }, |
| { |
| "clip_ratio": 0.02592139912303537, |
| "epoch": 0.09719626168224299, |
| "grad_norm": 0.9820704192096017, |
| "kl": 5.14837646484375, |
| "learning_rate": 5e-07, |
| "loss": 0.0277, |
| "step": 104 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1888.265625, |
| "epoch": 0.09813084112149532, |
| "grad_norm": 1.101428681790454, |
| "kl": 1.542755126953125, |
| "learning_rate": 5e-07, |
| "loss": 0.0239, |
| "reward": 0.26035156939178705, |
| "reward_std": 0.2557795748580247, |
| "rewards/end_of_conversation_reward_func": 0.052343750663567334, |
| "rewards/end_rm_reward_func": 0.2080078125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.028359673800878227, |
| "epoch": 0.09906542056074766, |
| "grad_norm": 1.0325324907073261, |
| "kl": 1.659881591796875, |
| "learning_rate": 5e-07, |
| "loss": 0.0243, |
| "step": 106 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1896.203125, |
| "epoch": 0.1, |
| "grad_norm": 1.2162479513962308, |
| "kl": 2.1485595703125, |
| "learning_rate": 5e-07, |
| "loss": 0.0064, |
| "reward": 0.4125000135973096, |
| "reward_std": 0.38071509543806314, |
| "rewards/end_of_conversation_reward_func": 0.08046875102445483, |
| "rewards/end_rm_reward_func": 0.33203125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 107 |
| }, |
| { |
| "clip_ratio": 0.03139533591456711, |
| "epoch": 0.10093457943925234, |
| "grad_norm": 1.080434258639653, |
| "kl": 1.02691650390625, |
| "learning_rate": 5e-07, |
| "loss": 0.0069, |
| "step": 108 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1806.3125, |
| "epoch": 0.10186915887850467, |
| "grad_norm": 1.1593632179482682, |
| "kl": 2.096435546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0132, |
| "reward": 0.3439453151077032, |
| "reward_std": 0.252871933626011, |
| "rewards/end_of_conversation_reward_func": 0.05781250086147338, |
| "rewards/end_rm_reward_func": 0.2861328125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 109 |
| }, |
| { |
| "clip_ratio": 0.024627559236250818, |
| "epoch": 0.102803738317757, |
| "grad_norm": 1.0771076027292037, |
| "kl": 3.48040771484375, |
| "learning_rate": 5e-07, |
| "loss": 0.0135, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1964.296875, |
| "epoch": 0.10373831775700934, |
| "grad_norm": 1.0171895687624364, |
| "kl": 240.58120727539062, |
| "learning_rate": 5e-07, |
| "loss": 0.0527, |
| "reward": 0.44511719804722816, |
| "reward_std": 0.3740708865225315, |
| "rewards/end_of_conversation_reward_func": 0.06718750117579475, |
| "rewards/end_rm_reward_func": 0.3779296875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 111 |
| }, |
| { |
| "clip_ratio": 0.028232302283868194, |
| "epoch": 0.10467289719626169, |
| "grad_norm": 0.9421146301422737, |
| "kl": 208.3177490234375, |
| "learning_rate": 5e-07, |
| "loss": 0.0534, |
| "step": 112 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2157.8359375, |
| "epoch": 0.10560747663551402, |
| "grad_norm": 1641.5392722026472, |
| "kl": 236.9107666015625, |
| "learning_rate": 5e-07, |
| "loss": 0.0971, |
| "reward": 0.33730469457805157, |
| "reward_std": 0.3461699963081628, |
| "rewards/end_of_conversation_reward_func": 0.06093750044237822, |
| "rewards/end_rm_reward_func": 0.2763671875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 113 |
| }, |
| { |
| "clip_ratio": 0.029742747312411666, |
| "epoch": 0.10654205607476636, |
| "grad_norm": 2650.3812295480748, |
| "kl": 11.709716796875, |
| "learning_rate": 5e-07, |
| "loss": 0.2283, |
| "step": 114 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2026.0546875, |
| "epoch": 0.10747663551401869, |
| "grad_norm": 1.1431858031367719, |
| "kl": 98.2745361328125, |
| "learning_rate": 5e-07, |
| "loss": 0.0166, |
| "reward": 0.381835951237008, |
| "reward_std": 0.3533117617480457, |
| "rewards/end_of_conversation_reward_func": 0.06250000040745363, |
| "rewards/end_rm_reward_func": 0.3193359375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.029958419385366142, |
| "epoch": 0.10841121495327102, |
| "grad_norm": 1.029379609705644, |
| "kl": 42.82733154296875, |
| "learning_rate": 5e-07, |
| "loss": 0.0172, |
| "step": 116 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1906.421875, |
| "epoch": 0.10934579439252337, |
| "grad_norm": 1.3109647944857514, |
| "kl": 2.085693359375, |
| "learning_rate": 5e-07, |
| "loss": 0.0308, |
| "reward": 0.3277343953959644, |
| "reward_std": 0.3133402925450355, |
| "rewards/end_of_conversation_reward_func": 0.07578125037252903, |
| "rewards/end_rm_reward_func": 0.251953125, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 117 |
| }, |
| { |
| "clip_ratio": 0.028814686927944422, |
| "epoch": 0.1102803738317757, |
| "grad_norm": 1.1262854576591834, |
| "kl": 1.031341552734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0312, |
| "step": 118 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2316.7578125, |
| "epoch": 0.11121495327102804, |
| "grad_norm": 1.0414340605792203, |
| "kl": 339.03094482421875, |
| "learning_rate": 5e-07, |
| "loss": 0.0345, |
| "reward": 0.3103515736875124, |
| "reward_std": 0.32673182454891503, |
| "rewards/end_of_conversation_reward_func": 0.06718750071013346, |
| "rewards/end_rm_reward_func": 0.2431640625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 119 |
| }, |
| { |
| "clip_ratio": 0.028005447005853057, |
| "epoch": 0.11214953271028037, |
| "grad_norm": 0.962355400088625, |
| "kl": 885.1737060546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0352, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1997.1796875, |
| "epoch": 0.1130841121495327, |
| "grad_norm": 1.2316757316354439, |
| "kl": 15193.576782226562, |
| "learning_rate": 5e-07, |
| "loss": 0.0502, |
| "reward": 0.3468750179745257, |
| "reward_std": 0.32240671874023974, |
| "rewards/end_of_conversation_reward_func": 0.07343750074505806, |
| "rewards/end_rm_reward_func": 0.2734375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio": 0.0309814119245857, |
| "epoch": 0.11401869158878504, |
| "grad_norm": 1.141627400565903, |
| "kl": 305263.65728759766, |
| "learning_rate": 5e-07, |
| "loss": 0.0508, |
| "step": 122 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1961.7734375, |
| "epoch": 0.11495327102803739, |
| "grad_norm": 1.2524833933172261, |
| "kl": 15.526611328125, |
| "learning_rate": 5e-07, |
| "loss": 0.0052, |
| "reward": 0.38671876676380634, |
| "reward_std": 0.3171714274212718, |
| "rewards/end_of_conversation_reward_func": 0.07812500069849193, |
| "rewards/end_rm_reward_func": 0.30859375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 123 |
| }, |
| { |
| "clip_ratio": 0.030462613562121987, |
| "epoch": 0.11588785046728972, |
| "grad_norm": 1.1092860205417427, |
| "kl": 258.77789306640625, |
| "learning_rate": 5e-07, |
| "loss": 0.0053, |
| "step": 124 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2030.90625, |
| "epoch": 0.11682242990654206, |
| "grad_norm": 1.5604290924244164, |
| "kl": 37.70721435546875, |
| "learning_rate": 5e-07, |
| "loss": 0.0311, |
| "reward": 0.48281251545995474, |
| "reward_std": 0.2945442160125822, |
| "rewards/end_of_conversation_reward_func": 0.07656250055879354, |
| "rewards/end_rm_reward_func": 0.40625, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.03006787970662117, |
| "epoch": 0.11775700934579439, |
| "grad_norm": 1.7413118821455535, |
| "kl": 23.40301513671875, |
| "learning_rate": 5e-07, |
| "loss": 0.0318, |
| "step": 126 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 2265.2421875, |
| "epoch": 0.11869158878504672, |
| "grad_norm": 1.212567517072936, |
| "kl": 42.730712890625, |
| "learning_rate": 5e-07, |
| "loss": 0.0065, |
| "reward": 0.4443359524011612, |
| "reward_std": 0.392240944551304, |
| "rewards/end_of_conversation_reward_func": 0.06250000098953024, |
| "rewards/end_rm_reward_func": 0.3818359375, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 127 |
| }, |
| { |
| "clip_ratio": 0.02885347604751587, |
| "epoch": 0.11962616822429907, |
| "grad_norm": 1.1199940504331805, |
| "kl": 13.65277099609375, |
| "learning_rate": 5e-07, |
| "loss": 0.0069, |
| "step": 128 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1686.7734375, |
| "epoch": 0.1205607476635514, |
| "grad_norm": 1.2282859501226184, |
| "kl": 2.62017822265625, |
| "learning_rate": 5e-07, |
| "loss": -0.0218, |
| "reward": 0.46074219583533704, |
| "reward_std": 0.32941993116401136, |
| "rewards/end_of_conversation_reward_func": 0.059375000826548785, |
| "rewards/end_rm_reward_func": 0.4013671875, |
| "rewards/length_reward_func": 0.0, |
| "rewards/thinking_reward_func": 0.0, |
| "step": 129 |
| }, |
| { |
| "clip_ratio": 0.029977424652315676, |
| "epoch": 0.12149532710280374, |
| "grad_norm": 1.201467543453189, |
| "kl": 4.4796142578125, |
| "learning_rate": 5e-07, |
| "loss": -0.0214, |
| "step": 130 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 1070, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|