{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00022, "eval_steps": 500, "global_step": 11, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1838.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 1617.0625, "completions/mean_terminated_length": 1617.0625, "completions/min_length": 1177.0, "completions/min_terminated_length": 1177.0, "entropy": 0.5484257489442825, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 1.9166723489761353, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0845, "num_tokens": 75257.0, "reward": -0.5254741907119751, "reward_std": 0.1556793451309204, "rewards/rollout_reward_func/mean": -0.5254741907119751, "rewards/rollout_reward_func/std": 0.19436030089855194, "sampling/importance_sampling_ratio/max": 1.6091933250427246, "sampling/importance_sampling_ratio/mean": 1.009643793106079, "sampling/importance_sampling_ratio/min": 0.7761501669883728, "sampling/sampling_logp_difference/max": 0.2959944009780884, "sampling/sampling_logp_difference/mean": 0.02158474177122116, "step": 1, "step_time": 39.089814388993545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5484257489442825, "epoch": 4e-05, "grad_norm": 1.95936918258667, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0845, "step": 2, "step_time": 6.1725359009979 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0016025641234591603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035556891234591603, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1648.46875, "completions/mean_terminated_length": 1648.46875, "completions/min_length": 1152.0, "completions/min_terminated_length": 1152.0, "entropy": 0.5180552825331688, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.2347609996795654, "kl": 0.0011785991373471916, "learning_rate": 5.714285714285715e-07, "loss": 0.1137, "num_tokens": 151661.0, "reward": -0.6038702726364136, "reward_std": 0.22588981688022614, "rewards/rollout_reward_func/mean": -0.6038702726364136, "rewards/rollout_reward_func/std": 0.2508644163608551, "sampling/importance_sampling_ratio/max": 1.4536101818084717, "sampling/importance_sampling_ratio/mean": 1.0416083335876465, "sampling/importance_sampling_ratio/min": 0.705021858215332, "sampling/sampling_logp_difference/max": 0.31142354011535645, "sampling/sampling_logp_difference/mean": 0.02324344590306282, "step": 3, "step_time": 33.94999921999988 }, { "clip_ratio/high_max": 0.007195723708719015, "clip_ratio/high_mean": 0.0035978618543595076, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035978618543595076, "entropy": 0.5177513211965561, "epoch": 8e-05, "grad_norm": 2.218416929244995, "kl": 0.0013052129361312836, "learning_rate": 8.571428571428572e-07, "loss": 0.1144, "step": 4, "step_time": 6.260843116993783 }, { "clip_ratio/high_max": 0.0036764706019312143, "clip_ratio/high_mean": 0.0018382353009656072, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018382353009656072, "completions/clipped_ratio": 0.0, "completions/max_length": 1892.0, "completions/max_terminated_length": 1892.0, "completions/mean_length": 1607.125, "completions/mean_terminated_length": 1607.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "entropy": 0.5215674266219139, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 2.6341018676757812, "kl": 0.0011459336819825694, "learning_rate": 1.142857142857143e-06, "loss": -0.0192, "num_tokens": 226943.0, "reward": -0.6061195731163025, "reward_std": 0.2195071578025818, "rewards/rollout_reward_func/mean": -0.6061195731163025, "rewards/rollout_reward_func/std": 0.23115375638008118, "sampling/importance_sampling_ratio/max": 2.042933940887451, "sampling/importance_sampling_ratio/mean": 0.9376565217971802, "sampling/importance_sampling_ratio/min": 3.4871042007720773e-14, "sampling/sampling_logp_difference/max": 28.74315643310547, "sampling/sampling_logp_difference/mean": 0.14062589406967163, "step": 5, "step_time": 31.375892003994522 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 0.5232524983584881, "epoch": 0.00012, "grad_norm": 2.3644168376922607, "kl": 0.0010918559346464463, "learning_rate": 1.4285714285714286e-06, "loss": -0.0195, "step": 6, "step_time": 6.260178877000726 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0016025641234591603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035556891234591603, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 1565.90625, "completions/mean_terminated_length": 1565.90625, "completions/min_length": 492.0, "completions/min_terminated_length": 492.0, "entropy": 0.48773781582713127, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 1.9083935022354126, "kl": 0.0010215075744781643, "learning_rate": 1.7142857142857145e-06, "loss": 0.0341, "num_tokens": 300553.0, "reward": -0.6151305437088013, "reward_std": 0.23046068847179413, "rewards/rollout_reward_func/mean": -0.6151305437088013, "rewards/rollout_reward_func/std": 0.24246515333652496, "sampling/importance_sampling_ratio/max": 1.3417084217071533, "sampling/importance_sampling_ratio/mean": 0.945120632648468, "sampling/importance_sampling_ratio/min": 2.504423404769973e-11, "sampling/sampling_logp_difference/max": 24.4301700592041, "sampling/sampling_logp_difference/mean": 0.06543836742639542, "step": 7, "step_time": 30.44348550399809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003388278419151902, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003388278419151902, "entropy": 0.48545313626527786, "epoch": 0.00016, "grad_norm": 1.8554192781448364, "kl": 0.0009212387376464903, "learning_rate": 2.0000000000000003e-06, "loss": 0.0347, "step": 8, "step_time": 6.283500026991533 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.00533150346018374, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00728462846018374, "completions/clipped_ratio": 0.0, "completions/max_length": 1925.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 1673.8125, "completions/mean_terminated_length": 1673.8125, "completions/min_length": 1305.0, "completions/min_terminated_length": 1305.0, "entropy": 0.5548458024859428, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 1.4972906112670898, "kl": 0.001124227695981972, "learning_rate": 2.285714285714286e-06, "loss": -0.0231, "num_tokens": 377565.0, "reward": -0.606047511100769, "reward_std": 0.17791865766048431, "rewards/rollout_reward_func/mean": -0.606047511100769, "rewards/rollout_reward_func/std": 0.21615628898143768, "sampling/importance_sampling_ratio/max": 1.6678369045257568, "sampling/importance_sampling_ratio/mean": 0.9152076244354248, "sampling/importance_sampling_ratio/min": 4.311538348567012e-10, "sampling/sampling_logp_difference/max": 20.632226943969727, "sampling/sampling_logp_difference/mean": 0.06397730857133865, "step": 9, "step_time": 41.25972091700169 }, { "clip_ratio/high_max": 0.00390625, "clip_ratio/high_mean": 0.001953125, "clip_ratio/low_mean": 0.0016891892300918698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00364231423009187, "entropy": 0.5546580441296101, "epoch": 0.0002, "grad_norm": 1.610571265220642, "kl": 0.001391250429151114, "learning_rate": 2.571428571428571e-06, "loss": -0.0255, "step": 10, "step_time": 7.209359069009224 }, { "clip_ratio/high_max": 0.009114583488553762, "clip_ratio/high_mean": 0.004557291744276881, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006789434468373656, "completions/clipped_ratio": 0.0, "completions/max_length": 1808.0, "completions/max_terminated_length": 1808.0, "completions/mean_length": 1554.34375, "completions/mean_terminated_length": 1554.34375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 0.4812197834253311, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 1.6533305644989014, "kl": 0.0016009878600016236, "learning_rate": 2.8571428571428573e-06, "loss": 0.0432, "num_tokens": 451126.0, "reward": -0.5563069581985474, "reward_std": 0.29338371753692627, "rewards/rollout_reward_func/mean": -0.5563069581985474, "rewards/rollout_reward_func/std": 0.3663959503173828, "sampling/importance_sampling_ratio/max": 1.6875137090682983, "sampling/importance_sampling_ratio/mean": 1.0345326662063599, "sampling/importance_sampling_ratio/min": 1.5839532376688004e-13, "sampling/sampling_logp_difference/max": 28.526655197143555, "sampling/sampling_logp_difference/mean": 0.07865896075963974, "step": 11, "step_time": 34.2954730700003 } ], "logging_steps": 1.0, "max_steps": 350000, "num_input_tokens_seen": 451126, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }