{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1504, "eval_steps": 50, "global_step": 47, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0298828125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1387.8, "completions/mean_length": 117.6021484375, "completions/mean_terminated_length": 73.92077941894532, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.016, "grad_norm": 0.0009012475493364036, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 12904598.0, "reward": 0.003271484375, "reward_std": 0.016773892380297185, "rewards/accuracy_reward": 0.0001953125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.00634765625, "rewards/mean_confidence_reward": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04052734375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1399.0, "completions/mean_length": 144.84462890625, "completions/mean_terminated_length": 86.13726043701172, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "epoch": 0.032, "grad_norm": 0.005089475307613611, "learning_rate": 1e-06, "loss": -0.0069, "num_tokens": 26344479.0, "reward": 0.02197265625, "reward_std": 0.0738394245505333, "rewards/accuracy_reward": 0.006640625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.0373046875, "rewards/mean_confidence_reward": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05029296875, "completions/max_length": 1536.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 218.273046875, "completions/mean_terminated_length": 148.43575439453124, "completions/min_length": 1.6, "completions/min_terminated_length": 1.6, "epoch": 0.048, "grad_norm": 0.003542242106050253, "learning_rate": 1e-06, "loss": 0.0482, "num_tokens": 40484651.0, "reward": 0.2947265625, "reward_std": 0.2773519217967987, "rewards/accuracy_reward": 0.0857421875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.5037109375, "rewards/mean_confidence_reward": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0291015625, "completions/max_length": 1536.0, "completions/max_terminated_length": 1409.0, "completions/mean_length": 187.0353515625, "completions/mean_terminated_length": 146.611474609375, "completions/min_length": 11.8, "completions/min_terminated_length": 11.8, "epoch": 0.064, "grad_norm": 0.0016335330437868834, "learning_rate": 1e-06, "loss": 0.0656, "num_tokens": 54174613.0, "reward": 0.524755859375, "reward_std": 0.23584804832935333, "rewards/accuracy_reward": 0.17412109375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.875390625, "rewards/mean_confidence_reward": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0296875, "completions/max_length": 1536.0, "completions/max_terminated_length": 1326.8, "completions/mean_length": 168.64677734375, "completions/mean_terminated_length": 126.81468353271484, "completions/min_length": 18.4, "completions/min_terminated_length": 18.4, "epoch": 0.08, "grad_norm": 0.001310898456722498, "learning_rate": 1e-06, "loss": 0.068, "num_tokens": 67691028.0, "reward": 0.577392578125, "reward_std": 0.1859208643436432, "rewards/accuracy_reward": 0.20341796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9513671875, "rewards/mean_confidence_reward": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02001953125, "completions/max_length": 1536.0, "completions/max_terminated_length": 1291.6, "completions/mean_length": 145.42451171875, "completions/mean_terminated_length": 117.01493835449219, "completions/min_length": 18.2, "completions/min_terminated_length": 18.2, "epoch": 0.096, "grad_norm": 0.001353453379124403, "learning_rate": 1e-06, "loss": 0.0497, "num_tokens": 81081103.0, "reward": 0.6099609375, "reward_std": 0.15915196239948273, "rewards/accuracy_reward": 0.24267578125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.97724609375, "rewards/mean_confidence_reward": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01162109375, "completions/max_length": 1536.0, "completions/max_terminated_length": 1185.4, "completions/mean_length": 123.5869140625, "completions/mean_terminated_length": 106.98431701660157, "completions/min_length": 23.4, "completions/min_terminated_length": 23.4, "epoch": 0.112, "grad_norm": 0.0013827328803017735, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 94312425.0, "reward": 0.63564453125, "reward_std": 0.14015594720840455, "rewards/accuracy_reward": 0.2837890625, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9875, "rewards/mean_confidence_reward": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00498046875, "completions/max_length": 1536.0, "completions/max_terminated_length": 955.8, "completions/mean_length": 107.708203125, "completions/mean_terminated_length": 100.56294860839844, "completions/min_length": 22.0, "completions/min_terminated_length": 22.0, "epoch": 0.128, "grad_norm": 0.0011756919557228684, "learning_rate": 1e-06, "loss": 0.0155, "num_tokens": 107188349.0, "reward": 0.64580078125, "reward_std": 0.12910378873348236, "rewards/accuracy_reward": 0.29716796875, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.99443359375, "rewards/mean_confidence_reward": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.003515625, "completions/max_length": 1536.0, "completions/max_terminated_length": 936.2, "completions/mean_length": 101.2041015625, "completions/mean_terminated_length": 96.14408721923829, "completions/min_length": 23.4, "completions/min_terminated_length": 23.4, "epoch": 0.144, "grad_norm": 0.001248349086381495, "learning_rate": 1e-06, "loss": 0.0119, "num_tokens": 120031431.0, "reward": 0.6857421875, "reward_std": 0.13882396817207338, "rewards/accuracy_reward": 0.3751953125, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.9962890625, "rewards/mean_confidence_reward": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0029296875, "completions/max_length": 1536.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 102.634033203125, "completions/mean_terminated_length": 98.42235565185547, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.1504, "num_tokens": 125106124.0, "reward": 0.6888427734375, "reward_std": 0.13154470920562744, "rewards/accuracy_reward": 0.380859375, "rewards/brier_reward": 0.0, "rewards/confidence_one_or_zero": 0.0, "rewards/format_reward": 0.996826171875, "rewards/mean_confidence_reward": 0.0, "step": 47, "total_flos": 0.0, "train_loss": 0.030232181475359075, "train_runtime": 7886.8615, "train_samples_per_second": 0.38, "train_steps_per_second": 0.006 } ], "logging_steps": 5, "max_steps": 47, "num_input_tokens_seen": 125106124, "num_train_epochs": 1, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }