{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 746, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 246.3, "completions/max_terminated_length": 240.26, "completions/mean_length": 195.0275, "completions/mean_terminated_length": 191.87969146728517, "completions/min_length": 147.76, "completions/min_terminated_length": 147.76, "entropy": 0.06807037293910981, "epoch": 0.06702412868632708, "frac_reward_zero_std": 0.4475, "grad_norm": 0.1978774070739746, "learning_rate": 1e-05, "loss": -0.0022, "num_tokens": 6268258.0, "reward": 12.489985446929932, "reward_std": 1.05244723290205, "rewards/event_reward_fn/mean": 11.62375, "rewards/event_reward_fn/std": 7.598931360244751, "rewards/format_reward_fn/mean": 0.8662354218959808, "rewards/format_reward_fn/std": 0.24084076710045338, "step": 50, "step_time": 24.881226640827954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.043125, "completions/max_length": 249.38, "completions/max_terminated_length": 244.26, "completions/mean_length": 198.4925, "completions/mean_terminated_length": 195.8674203491211, "completions/min_length": 155.1, "completions/min_terminated_length": 155.1, "entropy": 0.07096008479595184, "epoch": 0.13404825737265416, "frac_reward_zero_std": 0.42, "grad_norm": 0.31616032123565674, "learning_rate": 1e-05, "loss": -0.0052, "num_tokens": 12603730.0, "reward": 11.722552404403686, "reward_std": 1.104598103761673, "rewards/event_reward_fn/mean": 10.865, "rewards/event_reward_fn/std": 7.203483366966248, "rewards/format_reward_fn/mean": 0.8575523483753205, "rewards/format_reward_fn/std": 0.25920433282852173, "step": 100, "step_time": 23.881343694739044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.069375, "completions/max_length": 251.04, "completions/max_terminated_length": 245.08, "completions/mean_length": 201.58125, "completions/mean_terminated_length": 197.60445678710937, "completions/min_length": 157.96, "completions/min_terminated_length": 157.96, "entropy": 0.07228697955608368, "epoch": 0.20107238605898123, "frac_reward_zero_std": 0.41, "grad_norm": 0.1767224669456482, "learning_rate": 1e-05, "loss": 0.002, "num_tokens": 19236102.0, "reward": 11.989666719436645, "reward_std": 1.2850025883316993, "rewards/event_reward_fn/mean": 11.1225, "rewards/event_reward_fn/std": 7.3152674865722656, "rewards/format_reward_fn/mean": 0.8671666479110718, "rewards/format_reward_fn/std": 0.24983404949307442, "step": 150, "step_time": 27.783113366477192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.068125, "completions/max_length": 250.62, "completions/max_terminated_length": 244.54, "completions/mean_length": 201.1025, "completions/mean_terminated_length": 197.25198516845703, "completions/min_length": 156.4, "completions/min_terminated_length": 156.4, "entropy": 0.06773373357951641, "epoch": 0.2680965147453083, "frac_reward_zero_std": 0.415, "grad_norm": 0.13261352479457855, "learning_rate": 1e-05, "loss": -0.0029, "num_tokens": 25426958.0, "reward": 12.467143926620484, "reward_std": 1.1554639112949372, "rewards/event_reward_fn/mean": 11.59875, "rewards/event_reward_fn/std": 7.149877543449402, "rewards/format_reward_fn/mean": 0.8683938610553742, "rewards/format_reward_fn/std": 0.24253679752349855, "step": 200, "step_time": 24.421198091395198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.058125, "completions/max_length": 250.22, "completions/max_terminated_length": 243.8, "completions/mean_length": 200.303125, "completions/mean_terminated_length": 196.90248321533204, "completions/min_length": 162.42, "completions/min_terminated_length": 162.42, "entropy": 0.06486415289342404, "epoch": 0.3351206434316354, "frac_reward_zero_std": 0.385, "grad_norm": 0.49442073702812195, "learning_rate": 1e-05, "loss": -0.0036, "num_tokens": 31582342.0, "reward": 12.355808296203612, "reward_std": 1.1142808997631073, "rewards/event_reward_fn/mean": 11.48875, "rewards/event_reward_fn/std": 7.448825697898865, "rewards/format_reward_fn/mean": 0.8670582604408265, "rewards/format_reward_fn/std": 0.24978963822126388, "step": 250, "step_time": 25.453000083304943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04875, "completions/max_length": 248.68, "completions/max_terminated_length": 244.22, "completions/mean_length": 198.759375, "completions/mean_terminated_length": 196.16592681884765, "completions/min_length": 156.2, "completions/min_terminated_length": 156.2, "entropy": 0.0681518343836069, "epoch": 0.40214477211796246, "frac_reward_zero_std": 0.39, "grad_norm": 0.48775437474250793, "learning_rate": 1e-05, "loss": -0.0057, "num_tokens": 37800719.0, "reward": 12.434584522247315, "reward_std": 1.183589797616005, "rewards/event_reward_fn/mean": 11.56375, "rewards/event_reward_fn/std": 7.52141658782959, "rewards/format_reward_fn/mean": 0.8708344352245331, "rewards/format_reward_fn/std": 0.23306368254125118, "step": 300, "step_time": 25.360634116120636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.034375, "completions/max_length": 248.34, "completions/max_terminated_length": 245.28, "completions/mean_length": 203.264375, "completions/mean_terminated_length": 201.32774475097656, "completions/min_length": 157.54, "completions/min_terminated_length": 157.54, "entropy": 0.06739457175135613, "epoch": 0.4691689008042895, "frac_reward_zero_std": 0.3525, "grad_norm": 0.33356958627700806, "learning_rate": 1e-05, "loss": -0.004, "num_tokens": 44150011.0, "reward": 13.173797435760498, "reward_std": 1.2946509444713592, "rewards/event_reward_fn/mean": 12.28875, "rewards/event_reward_fn/std": 7.145490102767944, "rewards/format_reward_fn/mean": 0.885047378540039, "rewards/format_reward_fn/std": 0.22108205765485764, "step": 350, "step_time": 26.940150288008155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.064375, "completions/max_length": 252.9, "completions/max_terminated_length": 247.8, "completions/mean_length": 203.064375, "completions/mean_terminated_length": 199.5350747680664, "completions/min_length": 158.26, "completions/min_terminated_length": 158.26, "entropy": 0.0657703248411417, "epoch": 0.5361930294906166, "frac_reward_zero_std": 0.435, "grad_norm": 0.26359474658966064, "learning_rate": 1e-05, "loss": -0.0021, "num_tokens": 50384400.0, "reward": 12.238037357330322, "reward_std": 1.057584773004055, "rewards/event_reward_fn/mean": 11.37, "rewards/event_reward_fn/std": 7.154304637908935, "rewards/format_reward_fn/mean": 0.8680373668670655, "rewards/format_reward_fn/std": 0.26109003871679304, "step": 400, "step_time": 25.59800311360508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.049375, "completions/max_length": 249.06, "completions/max_terminated_length": 244.9, "completions/mean_length": 203.706875, "completions/mean_terminated_length": 200.99220581054686, "completions/min_length": 161.06, "completions/min_terminated_length": 161.06, "entropy": 0.06626586891710758, "epoch": 0.6032171581769437, "frac_reward_zero_std": 0.3775, "grad_norm": 0.48660293221473694, "learning_rate": 1e-05, "loss": -0.004, "num_tokens": 56771056.0, "reward": 13.009743461608887, "reward_std": 1.2429037857055665, "rewards/event_reward_fn/mean": 12.130625, "rewards/event_reward_fn/std": 7.234463820457458, "rewards/format_reward_fn/mean": 0.8791184043884277, "rewards/format_reward_fn/std": 0.23800445690751076, "step": 450, "step_time": 25.550446799769997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.066875, "completions/max_length": 251.92, "completions/max_terminated_length": 246.12, "completions/mean_length": 204.07625, "completions/mean_terminated_length": 200.35590240478516, "completions/min_length": 160.74, "completions/min_terminated_length": 160.74, "entropy": 0.06663089752197265, "epoch": 0.6702412868632708, "frac_reward_zero_std": 0.4025, "grad_norm": 0.6319305300712585, "learning_rate": 1e-05, "loss": -0.0042, "num_tokens": 63078757.0, "reward": 12.313038005828858, "reward_std": 1.1368902394175529, "rewards/event_reward_fn/mean": 11.4575, "rewards/event_reward_fn/std": 6.7143393945693965, "rewards/format_reward_fn/mean": 0.8555380630493165, "rewards/format_reward_fn/std": 0.2657873314619064, "step": 500, "step_time": 26.24973841637373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.091875, "completions/max_length": 252.82, "completions/max_terminated_length": 246.88, "completions/mean_length": 203.815, "completions/mean_terminated_length": 198.69242126464843, "completions/min_length": 161.16, "completions/min_terminated_length": 161.16, "entropy": 0.06187104433774948, "epoch": 0.7372654155495979, "frac_reward_zero_std": 0.425, "grad_norm": 0.40395304560661316, "learning_rate": 1e-05, "loss": -0.0025, "num_tokens": 69170452.0, "reward": 12.482298536300659, "reward_std": 1.0457301473617553, "rewards/event_reward_fn/mean": 11.64625, "rewards/event_reward_fn/std": 7.317771224975586, "rewards/format_reward_fn/mean": 0.8360484623908997, "rewards/format_reward_fn/std": 0.2895883430540562, "step": 550, "step_time": 24.193240740820766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.110625, "completions/max_length": 252.62, "completions/max_terminated_length": 246.8, "completions/mean_length": 208.275625, "completions/mean_terminated_length": 202.49910614013672, "completions/min_length": 165.54, "completions/min_terminated_length": 165.54, "entropy": 0.0649487990140915, "epoch": 0.8042895442359249, "frac_reward_zero_std": 0.38, "grad_norm": 0.37119486927986145, "learning_rate": 1e-05, "loss": 0.0006, "num_tokens": 75499314.0, "reward": 12.80059557914734, "reward_std": 1.1889909988641738, "rewards/event_reward_fn/mean": 11.97375, "rewards/event_reward_fn/std": 7.475857477188111, "rewards/format_reward_fn/mean": 0.8268455564975739, "rewards/format_reward_fn/std": 0.29714462146162984, "step": 600, "step_time": 24.3176869976148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625, "completions/max_length": 249.28, "completions/max_terminated_length": 244.8, "completions/mean_length": 202.789375, "completions/mean_terminated_length": 199.91522064208985, "completions/min_length": 161.74, "completions/min_terminated_length": 161.74, "entropy": 0.06481640346348286, "epoch": 0.871313672922252, "frac_reward_zero_std": 0.3975, "grad_norm": 0.08866075426340103, "learning_rate": 1e-05, "loss": -0.0023, "num_tokens": 81673001.0, "reward": 12.689926280975342, "reward_std": 1.2458794575929641, "rewards/event_reward_fn/mean": 11.815625, "rewards/event_reward_fn/std": 7.275726590156555, "rewards/format_reward_fn/mean": 0.8743013119697571, "rewards/format_reward_fn/std": 0.23756251022219657, "step": 650, "step_time": 25.04028965227306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.100625, "completions/max_length": 253.72, "completions/max_terminated_length": 248.28, "completions/mean_length": 205.536875, "completions/mean_terminated_length": 200.1349432373047, "completions/min_length": 162.16, "completions/min_terminated_length": 162.16, "entropy": 0.0658975774794817, "epoch": 0.938337801608579, "frac_reward_zero_std": 0.3975, "grad_norm": 0.2268964648246765, "learning_rate": 1e-05, "loss": -0.0008, "num_tokens": 87934795.0, "reward": 12.72035478591919, "reward_std": 1.1722034803032875, "rewards/event_reward_fn/mean": 11.888125, "rewards/event_reward_fn/std": 7.583159003257752, "rewards/format_reward_fn/mean": 0.8322297859191895, "rewards/format_reward_fn/std": 0.29026631206274034, "step": 700, "step_time": 24.744350045956672 } ], "logging_steps": 50, "max_steps": 7460, "num_input_tokens_seen": 93493541, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }