{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.00026, "eval_steps": 500, "global_step": 13, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4368.0, "completions/max_terminated_length": 4368.0, "completions/mean_length": 3872.46875, "completions/mean_terminated_length": 3872.46875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "entropy": 0.5444511137902737, "epoch": 2e-05, "frac_reward_zero_std": 0.0, "grad_norm": 3.4791297912597656, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0766, "num_tokens": 147182.0, "reward": -2.4586520195007324, "reward_std": 1.220275640487671, "rewards/rollout_reward_func/mean": -2.4586520195007324, "rewards/rollout_reward_func/std": 1.256102204322815, "sampling/importance_sampling_ratio/max": 1.6867375373840332, "sampling/importance_sampling_ratio/mean": 0.9737950563430786, "sampling/importance_sampling_ratio/min": 0.27162742614746094, "sampling/sampling_logp_difference/max": 0.7351722717285156, "sampling/sampling_logp_difference/mean": 0.02089117467403412, "step": 1, "step_time": 57.306324382999264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5444511137902737, "epoch": 4e-05, "grad_norm": 3.456141710281372, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": 0.0766, "step": 2, "step_time": 11.957301280001047 }, { "clip_ratio/high_max": 0.0016891892300918698, "clip_ratio/high_mean": 0.0008445946150459349, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003448761359322816, "completions/clipped_ratio": 0.0, "completions/max_length": 4214.0, "completions/max_terminated_length": 4214.0, "completions/mean_length": 3857.65625, "completions/mean_terminated_length": 3857.65625, "completions/min_length": 2976.0, "completions/min_terminated_length": 2976.0, "entropy": 0.5738473497331142, "epoch": 6e-05, "frac_reward_zero_std": 0.0, "grad_norm": 2.3130648136138916, "kl": 0.0009115237262449227, "learning_rate": 5.714285714285715e-07, "loss": 0.0672, "num_tokens": 294237.0, "reward": -2.499704122543335, "reward_std": 1.0055248737335205, "rewards/rollout_reward_func/mean": -2.499704122543335, "rewards/rollout_reward_func/std": 1.122406005859375, "sampling/importance_sampling_ratio/max": 1.5998320579528809, "sampling/importance_sampling_ratio/mean": 0.9918304085731506, "sampling/importance_sampling_ratio/min": 0.6382073163986206, "sampling/sampling_logp_difference/max": 0.33383655548095703, "sampling/sampling_logp_difference/mean": 0.021819621324539185, "step": 3, "step_time": 55.93842194800072 }, { "clip_ratio/high_max": 0.004406580585055053, "clip_ratio/high_mean": 0.0022032902925275266, "clip_ratio/low_mean": 0.005063657532446086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007266947708558291, "entropy": 0.5751528590917587, "epoch": 8e-05, "grad_norm": 2.254608631134033, "kl": 0.0011329303233651444, "learning_rate": 8.571428571428572e-07, "loss": 0.065, "step": 4, "step_time": 11.812342160998924 }, { "clip_ratio/high_max": 0.007812500232830644, "clip_ratio/high_mean": 0.003906250116415322, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006510416860692203, "completions/clipped_ratio": 0.0, "completions/max_length": 4288.0, "completions/max_terminated_length": 4288.0, "completions/mean_length": 3989.28125, "completions/mean_terminated_length": 3989.28125, "completions/min_length": 2506.0, "completions/min_terminated_length": 2506.0, "entropy": 0.5535362251102924, "epoch": 0.0001, "frac_reward_zero_std": 0.0, "grad_norm": 2.2777466773986816, "kl": 0.0011825040710391477, "learning_rate": 1.142857142857143e-06, "loss": 0.0929, "num_tokens": 445258.0, "reward": -2.45902681350708, "reward_std": 0.6927163600921631, "rewards/rollout_reward_func/mean": -2.45902681350708, "rewards/rollout_reward_func/std": 0.8290045261383057, "sampling/importance_sampling_ratio/max": 1.6969612836837769, "sampling/importance_sampling_ratio/mean": 1.0004469156265259, "sampling/importance_sampling_ratio/min": 0.5009263157844543, "sampling/sampling_logp_difference/max": 0.36035799980163574, "sampling/sampling_logp_difference/mean": 0.02125256508588791, "step": 5, "step_time": 60.257389979000436 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013586956774815917, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026607790496200323, "entropy": 0.5545530468225479, "epoch": 0.00012, "grad_norm": 2.07369065284729, "kl": 0.0011546705500222743, "learning_rate": 1.4285714285714286e-06, "loss": 0.0921, "step": 6, "step_time": 11.93783174300097 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 4363.0, "completions/max_terminated_length": 4363.0, "completions/mean_length": 3765.53125, "completions/mean_terminated_length": 3765.53125, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.573919378221035, "epoch": 0.00014, "frac_reward_zero_std": 0.0, "grad_norm": 2.3299458026885986, "kl": 0.0011782783294620458, "learning_rate": 1.7142857142857145e-06, "loss": 0.0167, "num_tokens": 589832.0, "reward": -2.46976375579834, "reward_std": 1.2192020416259766, "rewards/rollout_reward_func/mean": -2.46976375579834, "rewards/rollout_reward_func/std": 1.360211730003357, "sampling/importance_sampling_ratio/max": 1.4216840267181396, "sampling/importance_sampling_ratio/mean": 0.9407027959823608, "sampling/importance_sampling_ratio/min": 0.5079315900802612, "sampling/sampling_logp_difference/max": 0.42226099967956543, "sampling/sampling_logp_difference/mean": 0.022609494626522064, "step": 7, "step_time": 54.58579348800049 }, { "clip_ratio/high_max": 0.007015306269749999, "clip_ratio/high_mean": 0.005070153274573386, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006372236646711826, "entropy": 0.5752841830253601, "epoch": 0.00016, "grad_norm": 2.360698938369751, "kl": 0.001157285601948388, "learning_rate": 2.0000000000000003e-06, "loss": 0.0157, "step": 8, "step_time": 12.033116390001851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 4305.0, "completions/max_terminated_length": 4305.0, "completions/mean_length": 3619.4375, "completions/mean_terminated_length": 3619.4375, "completions/min_length": 1128.0, "completions/min_terminated_length": 1128.0, "entropy": 0.5393632538616657, "epoch": 0.00018, "frac_reward_zero_std": 0.0, "grad_norm": 2.6401772499084473, "kl": 0.0009530504758004099, "learning_rate": 2.285714285714286e-06, "loss": 0.0927, "num_tokens": 729170.0, "reward": -2.326364517211914, "reward_std": 1.2151319980621338, "rewards/rollout_reward_func/mean": -2.326364517211914, "rewards/rollout_reward_func/std": 1.404052972793579, "sampling/importance_sampling_ratio/max": 1.7163654565811157, "sampling/importance_sampling_ratio/mean": 0.9872620105743408, "sampling/importance_sampling_ratio/min": 0.5921608805656433, "sampling/sampling_logp_difference/max": 0.5117590427398682, "sampling/sampling_logp_difference/mean": 0.024541478604078293, "step": 9, "step_time": 53.8118479720024 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013020833721384406, "entropy": 0.5393857695162296, "epoch": 0.0002, "grad_norm": 2.6921041011810303, "kl": 0.0009762203881109599, "learning_rate": 2.571428571428571e-06, "loss": 0.0925, "step": 10, "step_time": 12.179582908999691 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 4316.0, "completions/max_terminated_length": 4316.0, "completions/mean_length": 3972.75, "completions/mean_terminated_length": 3972.75, "completions/min_length": 2992.0, "completions/min_terminated_length": 2992.0, "entropy": 0.5662294253706932, "epoch": 0.00022, "frac_reward_zero_std": 0.0, "grad_norm": 2.0718979835510254, "kl": 0.0011709001882991288, "learning_rate": 2.8571428571428573e-06, "loss": -0.0709, "num_tokens": 879951.0, "reward": -2.6862502098083496, "reward_std": 0.7006374597549438, "rewards/rollout_reward_func/mean": -2.6862502098083496, "rewards/rollout_reward_func/std": 0.7967818379402161, "sampling/importance_sampling_ratio/max": 1.4347306489944458, "sampling/importance_sampling_ratio/mean": 0.9685467481613159, "sampling/importance_sampling_ratio/min": 0.6337878704071045, "sampling/sampling_logp_difference/max": 0.2577688694000244, "sampling/sampling_logp_difference/mean": 0.02011028863489628, "step": 11, "step_time": 56.197459413000615 }, { "clip_ratio/high_max": 0.005055147223174572, "clip_ratio/high_mean": 0.002527573611587286, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002527573611587286, "entropy": 0.5676523223519325, "epoch": 0.00024, "grad_norm": 2.133882522583008, "kl": 0.0011278250749455765, "learning_rate": 3.142857142857143e-06, "loss": -0.0707, "step": 12, "step_time": 12.02025387499998 }, { "clip_ratio/high_max": 0.0026041667442768812, "clip_ratio/high_mean": 0.0013020833721384406, "clip_ratio/low_mean": 0.0013020833721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 4299.0, "completions/max_terminated_length": 4299.0, "completions/mean_length": 3730.71875, "completions/mean_terminated_length": 3730.71875, "completions/min_length": 1093.0, "completions/min_terminated_length": 1093.0, "entropy": 0.5347183495759964, "epoch": 0.00026, "frac_reward_zero_std": 0.0, "grad_norm": 2.1146907806396484, "kl": 0.0010681173953344114, "learning_rate": 3.428571428571429e-06, "loss": 0.1176, "num_tokens": 1023348.0, "reward": -2.0156006813049316, "reward_std": 1.2150049209594727, "rewards/rollout_reward_func/mean": -2.0156006813049316, "rewards/rollout_reward_func/std": 1.5709649324417114, "sampling/importance_sampling_ratio/max": 1.7714853286743164, "sampling/importance_sampling_ratio/mean": 1.0881149768829346, "sampling/importance_sampling_ratio/min": 0.5891880989074707, "sampling/sampling_logp_difference/max": 0.34902477264404297, "sampling/sampling_logp_difference/mean": 0.018774278461933136, "step": 13, "step_time": 55.21850023600109 } ], "logging_steps": 1.0, "max_steps": 200000, "num_input_tokens_seen": 1023348, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }