{ "best_metric": 0.6185767650604248, "best_model_checkpoint": "/storage/trained_grpo_distill_14b/v42-20250317-212613/checkpoint-2", "epoch": 1.784313725490196, "eval_steps": 2, "global_step": 22, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 6417.984375, "epoch": 0.0784313725490196, "grad_norm": 0.002763764699921012, "kl": 0.0, "learning_rate": 1.6666666666666667e-05, "loss": 0.0836174339056015, "memory(GiB)": 184.97, "response_clip_ratio": 0.12890625, "reward": 0.4296856001019478, "reward_std": 0.14499560371041298, "rewards/CosineReward": 0.4296856001019478, "step": 1, "train_speed(iter/s)": 0.000129 }, { "epoch": 0.1568627450980392, "grad_norm": 0.0027679470367729664, "learning_rate": 3.3333333333333335e-05, "loss": 0.0836174339056015, "memory(GiB)": 184.97, "step": 2, "train_speed(iter/s)": 0.000249 }, { "epoch": 0.1568627450980392, "eval_clip_ratio": 0.00024764647241681814, "eval_completion_length": 5048.4375, "eval_kl": 0.0001354217529296875, "eval_loss": 0.0001584067940711975, "eval_response_clip_ratio": 0.0625, "eval_reward": 0.6185767650604248, "eval_reward_std": 0.12649827264249325, "eval_rewards/CosineReward": 0.6185767650604248, "eval_runtime": 1074.328, "eval_samples_per_second": 0.007, "eval_steps_per_second": 0.001, "step": 2 }, { "clip_ratio": 0.0001064514563040575, "completion_length": 6496.7421875, "epoch": 0.23529411764705882, "grad_norm": 0.0026931765023618937, "kl": 6.115436553955078e-05, "learning_rate": 5e-05, "loss": 0.08872900903224945, "memory(GiB)": 184.97, "response_clip_ratio": 0.15625, "reward": 0.4729522615671158, "reward_std": 0.11709235422313213, "rewards/CosineReward": 0.4729522615671158, "step": 3, "train_speed(iter/s)": 0.000176 }, { "epoch": 0.3137254901960784, "grad_norm": 0.002692870795726776, "learning_rate": 6.666666666666667e-05, "loss": 0.08870330452919006, "memory(GiB)": 184.97, "step": 4, "train_speed(iter/s)": 0.00023 }, { "epoch": 0.3137254901960784, "eval_clip_ratio": 0.0002070416376227513, "eval_completion_length": 5490.0, "eval_kl": 0.00012302398681640625, "eval_loss": 1.0617077350616455e-05, "eval_response_clip_ratio": 0.0, "eval_reward": 0.5434095412492752, "eval_reward_std": 0.1754817459732294, "eval_rewards/CosineReward": 0.5434095412492752, "eval_runtime": 1342.619, "eval_samples_per_second": 0.006, "eval_steps_per_second": 0.001, "step": 4 }, { "clip_ratio": 0.00019944452469644602, "completion_length": 6302.51953125, "epoch": 0.39215686274509803, "grad_norm": 0.003180823987349868, "kl": 0.00012123584747314453, "learning_rate": 8.333333333333334e-05, "loss": 0.08817961066961288, "memory(GiB)": 184.97, "response_clip_ratio": 0.1171875, "reward": 0.480031818151474, "reward_std": 0.12067002058029175, "rewards/CosineReward": 0.480031818151474, "step": 5, "train_speed(iter/s)": 0.000188 }, { "epoch": 0.47058823529411764, "grad_norm": 0.003291113767772913, "learning_rate": 0.0001, "loss": 0.08811932057142258, "memory(GiB)": 184.97, "step": 6, "train_speed(iter/s)": 0.000224 }, { "epoch": 0.47058823529411764, "eval_clip_ratio": 0.00020231583039276302, "eval_completion_length": 5615.6875, "eval_kl": 0.00018548965454101562, "eval_loss": 0.00017052143812179565, "eval_response_clip_ratio": 0.125, "eval_reward": 0.4852888882160187, "eval_reward_std": 0.14715369045734406, "eval_rewards/CosineReward": 0.4852888882160187, "eval_runtime": 1364.1653, "eval_samples_per_second": 0.006, "eval_steps_per_second": 0.001, "step": 6 }, { "clip_ratio": 0.0002164878969779238, "completion_length": 6660.578125, "epoch": 0.5490196078431373, "grad_norm": 0.003197300247848034, "kl": 0.00015372037887573242, "learning_rate": 9.991540791356342e-05, "loss": 0.086369089782238, "memory(GiB)": 184.97, "response_clip_ratio": 0.13671875, "reward": 0.43960579484701157, "reward_std": 0.12519060634076595, "rewards/CosineReward": 0.43960579484701157, "step": 7, "train_speed(iter/s)": 0.000194 }, { "epoch": 0.6274509803921569, "grad_norm": 0.0034806649200618267, "learning_rate": 9.966191788709716e-05, "loss": 0.08621430397033691, "memory(GiB)": 184.97, "step": 8, "train_speed(iter/s)": 0.00022 }, { "epoch": 0.6274509803921569, "eval_clip_ratio": 7.152849320846144e-05, "eval_completion_length": 5739.0625, "eval_kl": 0.001102447509765625, "eval_loss": 0.00010670721530914307, "eval_response_clip_ratio": 0.125, "eval_reward": 0.5124959498643875, "eval_reward_std": 0.21372128650546074, "eval_rewards/CosineReward": 0.5124959498643875, "eval_runtime": 1470.3009, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 8 }, { "clip_ratio": 0.0003287304334662622, "completion_length": 6587.18359375, "epoch": 0.7058823529411765, "grad_norm": 0.004174312110990286, "kl": 0.0007574558258056641, "learning_rate": 9.924038765061042e-05, "loss": 0.0881085991859436, "memory(GiB)": 184.97, "response_clip_ratio": 0.1953125, "reward": 0.4415409117937088, "reward_std": 0.10440743528306484, "rewards/CosineReward": 0.4415409117937088, "step": 9, "train_speed(iter/s)": 0.000196 }, { "epoch": 0.7843137254901961, "grad_norm": 0.003945177886635065, "learning_rate": 9.865224352899119e-05, "loss": 0.08792611956596375, "memory(GiB)": 184.97, "step": 10, "train_speed(iter/s)": 0.000217 }, { "epoch": 0.7843137254901961, "eval_clip_ratio": 0.00014116203965386376, "eval_completion_length": 6354.875, "eval_kl": 0.003173828125, "eval_loss": 3.191456198692322e-05, "eval_response_clip_ratio": 0.125, "eval_reward": 0.5644881427288055, "eval_reward_std": 0.11511430516839027, "eval_rewards/CosineReward": 0.5644881427288055, "eval_runtime": 1510.6041, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 10 }, { "clip_ratio": 0.0004681620275732712, "completion_length": 6903.640625, "epoch": 0.8627450980392157, "grad_norm": 0.00393508980050683, "kl": 0.002521514892578125, "learning_rate": 9.789947561577445e-05, "loss": 0.0897117406129837, "memory(GiB)": 184.97, "response_clip_ratio": 0.171875, "reward": 0.4390856474637985, "reward_std": 0.11577454954385757, "rewards/CosineReward": 0.4390856474637985, "step": 11, "train_speed(iter/s)": 0.000198 }, { "epoch": 0.9411764705882353, "grad_norm": 0.0034022172912955284, "learning_rate": 9.698463103929542e-05, "loss": 0.08947472274303436, "memory(GiB)": 184.97, "step": 12, "train_speed(iter/s)": 0.000215 }, { "epoch": 0.9411764705882353, "eval_clip_ratio": 7.602112054883037e-05, "eval_completion_length": 6181.5, "eval_kl": 0.00714111328125, "eval_loss": 0.00035206228494644165, "eval_response_clip_ratio": 0.1875, "eval_reward": 0.45696309208869934, "eval_reward_std": 0.04341535549610853, "eval_rewards/CosineReward": 0.45696309208869934, "eval_runtime": 1437.7604, "eval_samples_per_second": 0.006, "eval_steps_per_second": 0.001, "step": 12 }, { "clip_ratio": 0.0003813115013144852, "completion_length": 6900.296875, "epoch": 1.0784313725490196, "grad_norm": 0.013968409970402718, "kl": 0.0058441162109375, "learning_rate": 9.591080534401371e-05, "loss": 0.08859321475028992, "memory(GiB)": 184.97, "response_clip_ratio": 0.203125, "reward": 0.4131528064608574, "reward_std": 0.1398314479738474, "rewards/CosineReward": 0.4131528064608574, "step": 13, "train_speed(iter/s)": 0.000199 }, { "epoch": 1.156862745098039, "grad_norm": 0.0037056964356452227, "learning_rate": 9.468163201617062e-05, "loss": 0.08834587037563324, "memory(GiB)": 184.97, "step": 14, "train_speed(iter/s)": 0.000214 }, { "epoch": 1.156862745098039, "eval_clip_ratio": 0.00016801093079266138, "eval_completion_length": 6428.5, "eval_kl": 0.012603759765625, "eval_loss": 0.0005806386470794678, "eval_response_clip_ratio": 0.25, "eval_reward": 0.4538377672433853, "eval_reward_std": 0.1443052440881729, "eval_rewards/CosineReward": 0.4538377672433853, "eval_runtime": 1590.5873, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 14 }, { "clip_ratio": 0.00040143599017028464, "completion_length": 6398.2890625, "epoch": 1.2352941176470589, "grad_norm": 0.0041822106577456, "kl": 0.0112152099609375, "learning_rate": 9.330127018922194e-05, "loss": 0.09059401601552963, "memory(GiB)": 184.97, "response_clip_ratio": 0.1484375, "reward": 0.46379417926073074, "reward_std": 0.11776200495660305, "rewards/CosineReward": 0.46379417926073074, "step": 15, "train_speed(iter/s)": 0.0002 }, { "epoch": 1.3137254901960784, "grad_norm": 0.0046047731302678585, "learning_rate": 9.177439057064683e-05, "loss": 0.09031489491462708, "memory(GiB)": 184.97, "step": 16, "train_speed(iter/s)": 0.000212 }, { "epoch": 1.3137254901960784, "eval_clip_ratio": 5.4339221605914645e-05, "eval_completion_length": 5552.0625, "eval_kl": 0.01727294921875, "eval_loss": 0.0005940012633800507, "eval_response_clip_ratio": 0.0625, "eval_reward": 0.506618320941925, "eval_reward_std": 0.21169210970401764, "eval_rewards/CosineReward": 0.506618320941925, "eval_runtime": 1518.0802, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 16 }, { "clip_ratio": 6.063861019356409e-05, "completion_length": 6686.59765625, "epoch": 1.392156862745098, "grad_norm": 0.004301061388105154, "kl": 0.0179443359375, "learning_rate": 9.01061596377522e-05, "loss": 0.07838998734951019, "memory(GiB)": 184.28, "response_clip_ratio": 0.18359375, "reward": 0.4415987730026245, "reward_std": 0.09190415777266026, "rewards/CosineReward": 0.4415987730026245, "step": 17, "train_speed(iter/s)": 0.002176 }, { "epoch": 1.4705882352941178, "grad_norm": 0.004479076247662306, "learning_rate": 8.83022221559489e-05, "loss": 0.07815618067979813, "memory(GiB)": 184.28, "step": 18, "train_speed(iter/s)": 0.002225 }, { "epoch": 1.4705882352941178, "eval_clip_ratio": 9.80815566435922e-05, "eval_completion_length": 6907.125, "eval_kl": 0.02203369140625, "eval_loss": 0.0009158775210380554, "eval_response_clip_ratio": 0.1875, "eval_reward": 0.4723748713731766, "eval_reward_std": 0.3155394047498703, "eval_rewards/CosineReward": 0.4723748713731766, "eval_runtime": 1457.6003, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 18 }, { "clip_ratio": 0.00023432340185536304, "completion_length": 7038.578125, "epoch": 1.5490196078431373, "grad_norm": 0.005921878386288881, "kl": 0.0222320556640625, "learning_rate": 8.636868207865244e-05, "loss": 0.0942603051662445, "memory(GiB)": 184.28, "response_clip_ratio": 0.19921875, "reward": 0.42305945605039597, "reward_std": 0.17355207353830338, "rewards/CosineReward": 0.42305945605039597, "step": 19, "train_speed(iter/s)": 0.001092 }, { "epoch": 1.6274509803921569, "grad_norm": 0.006151494570076466, "learning_rate": 8.43120818934367e-05, "loss": 0.09388023614883423, "memory(GiB)": 184.28, "step": 20, "train_speed(iter/s)": 0.001131 }, { "epoch": 1.6274509803921569, "eval_clip_ratio": 6.59385500512144e-05, "eval_completion_length": 7039.0625, "eval_kl": 0.02911376953125, "eval_loss": 0.0012894980609416962, "eval_response_clip_ratio": 0.1875, "eval_reward": 0.4684200882911682, "eval_reward_std": 0.3149525970220566, "eval_rewards/CosineReward": 0.4684200882911682, "eval_runtime": 1460.114, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 20 }, { "clip_ratio": 0.00028659221607085783, "completion_length": 7469.6796875, "epoch": 1.7058823529411766, "grad_norm": 0.006666958797723055, "kl": 0.0279693603515625, "learning_rate": 8.213938048432697e-05, "loss": 0.075472891330719, "memory(GiB)": 184.28, "response_clip_ratio": 0.29296875, "reward": 0.3378320038318634, "reward_std": 0.1059294268488884, "rewards/CosineReward": 0.3378320038318634, "step": 21, "train_speed(iter/s)": 0.000775 }, { "epoch": 1.784313725490196, "grad_norm": 0.0069817290641367435, "learning_rate": 7.985792958513931e-05, "loss": 0.07505190372467041, "memory(GiB)": 184.28, "step": 22, "train_speed(iter/s)": 0.000803 }, { "epoch": 1.784313725490196, "eval_clip_ratio": 3.5660988942254335e-05, "eval_completion_length": 7378.4375, "eval_kl": 0.0372314453125, "eval_loss": 0.0015567503869533539, "eval_response_clip_ratio": 0.1875, "eval_reward": 0.4472413510084152, "eval_reward_std": 0.311752051115036, "eval_rewards/CosineReward": 0.4472413510084152, "eval_runtime": 1469.5869, "eval_samples_per_second": 0.005, "eval_steps_per_second": 0.001, "step": 22 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }