{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9655172413793104, "eval_steps": 500, "global_step": 14, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 396.88172149658203, "epoch": 0.06896551724137931, "grad_norm": 0.3486388921737671, "kl": 0.0, "learning_rate": 1e-05, "loss": -0.0087, "reward": 0.337053582072258, "reward_std": 0.7216125279664993, "rewards/": 0.3169643059372902, "rewards/format_reward": 0.020089286845177412, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 402.25001525878906, "epoch": 0.13793103448275862, "grad_norm": 0.31598225235939026, "kl": 0.0, "learning_rate": 2e-05, "loss": -0.0083, "reward": 0.3638393059372902, "reward_std": 0.7311068773269653, "rewards/": 0.3526785895228386, "rewards/format_reward": 0.011160714784637094, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 401.2143020629883, "epoch": 0.20689655172413793, "grad_norm": 0.3833157420158386, "kl": 0.008453369140625, "learning_rate": 1.9659258262890683e-05, "loss": 0.0091, "reward": 0.3660714477300644, "reward_std": 0.700179249048233, "rewards/": 0.3392857238650322, "rewards/format_reward": 0.026785716181620955, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 300.3839416503906, "epoch": 0.27586206896551724, "grad_norm": 131.17250061035156, "kl": 1.0369873046875, "learning_rate": 1.866025403784439e-05, "loss": 0.0823, "reward": 0.486607164144516, "reward_std": 0.8625286668539047, "rewards/": 0.4508928880095482, "rewards/format_reward": 0.035714288242161274, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 373.6428680419922, "epoch": 0.3448275862068966, "grad_norm": 3.0509426593780518, "kl": 0.2314453125, "learning_rate": 1.7071067811865477e-05, "loss": -0.0147, "reward": 0.6294643133878708, "reward_std": 0.9069008827209473, "rewards/": 0.5178571566939354, "rewards/format_reward": 0.11160714738070965, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 426.9553756713867, "epoch": 0.41379310344827586, "grad_norm": 4.6992268562316895, "kl": 0.201904296875, "learning_rate": 1.5000000000000002e-05, "loss": 0.0026, "reward": 0.558035746216774, "reward_std": 0.906622901558876, "rewards/": 0.5178571566939354, "rewards/format_reward": 0.0401785746216774, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 455.0915298461914, "epoch": 0.4827586206896552, "grad_norm": 0.3951745331287384, "kl": 0.177734375, "learning_rate": 1.2588190451025209e-05, "loss": 0.001, "reward": 0.4508928805589676, "reward_std": 0.8413252383470535, "rewards/": 0.4508928805589676, "rewards/format_reward": 0.0, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 450.3794860839844, "epoch": 0.5517241379310345, "grad_norm": 0.369406521320343, "kl": 0.164306640625, "learning_rate": 1e-05, "loss": 0.0424, "reward": 0.500000037252903, "reward_std": 0.8601988106966019, "rewards/": 0.500000037252903, "rewards/format_reward": 0.0, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 423.6205596923828, "epoch": 0.6206896551724138, "grad_norm": 0.39313390851020813, "kl": 0.155029296875, "learning_rate": 7.411809548974792e-06, "loss": -0.0279, "reward": 0.5223214626312256, "reward_std": 0.8841909915208817, "rewards/": 0.5223214626312256, "rewards/format_reward": 0.0, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 438.4888610839844, "epoch": 0.6896551724137931, "grad_norm": 0.3800601661205292, "kl": 0.149169921875, "learning_rate": 5.000000000000003e-06, "loss": -0.0136, "reward": 0.549107164144516, "reward_std": 0.8823030292987823, "rewards/": 0.549107164144516, "rewards/format_reward": 0.0, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 398.26341247558594, "epoch": 0.7586206896551724, "grad_norm": 0.3832401931285858, "kl": 0.141845703125, "learning_rate": 2.9289321881345257e-06, "loss": -0.0314, "reward": 0.5089286118745804, "reward_std": 0.8833121657371521, "rewards/": 0.5089286118745804, "rewards/format_reward": 0.0, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 424.11832427978516, "epoch": 0.8275862068965517, "grad_norm": 0.3395453095436096, "kl": 0.134521484375, "learning_rate": 1.339745962155613e-06, "loss": -0.0237, "reward": 0.5758928954601288, "reward_std": 0.9218239635229111, "rewards/": 0.5758928954601288, "rewards/format_reward": 0.0, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 397.5379638671875, "epoch": 0.896551724137931, "grad_norm": 0.3803483247756958, "kl": 0.1341552734375, "learning_rate": 3.4074173710931804e-07, "loss": -0.0323, "reward": 0.5625000149011612, "reward_std": 0.9007573574781418, "rewards/": 0.5625000149011612, "rewards/format_reward": 0.0, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 413.578125, "epoch": 0.9655172413793104, "grad_norm": 0.33767423033714294, "kl": 0.13232421875, "learning_rate": 0.0, "loss": -0.0033, "reward": 0.5558035969734192, "reward_std": 0.8983689099550247, "rewards/": 0.5535714477300644, "rewards/format_reward": 0.0022321429569274187, "step": 14 }, { "epoch": 0.9655172413793104, "step": 14, "total_flos": 0.0, "train_loss": -0.0018907431845686265, "train_runtime": 2665.9182, "train_samples_per_second": 0.15, "train_steps_per_second": 0.005 } ], "logging_steps": 1, "max_steps": 14, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }