{ "best_global_step": 100, "best_metric": 7.878235010139178e-06, "best_model_checkpoint": "models/reward-model/checkpoint-100", "epoch": 3.0, "eval_steps": 50, "global_step": 111, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "accuracy": 0.475, "epoch": 0.273972602739726, "grad_norm": 30.625, "learning_rate": 9.000000000000001e-07, "loss": 0.6891, "margin": 0.01259765625, "max_reward": 0.8927734375, "mean_reward": 0.841064453125, "min_reward": 0.78935546875, "num_tokens": 33836.0, "step": 10 }, { "accuracy": 0.7, "epoch": 0.547945205479452, "grad_norm": 31.0, "learning_rate": 1.9000000000000002e-06, "loss": 0.6556, "margin": 0.08076171875, "max_reward": 0.940087890625, "mean_reward": 0.881591796875, "min_reward": 0.823095703125, "num_tokens": 68438.0, "step": 20 }, { "accuracy": 0.875, "epoch": 0.821917808219178, "grad_norm": 42.75, "learning_rate": 2.9e-06, "loss": 0.5442, "margin": 0.348681640625, "max_reward": 1.237109375, "mean_reward": 1.0571533203125, "min_reward": 0.877197265625, "num_tokens": 102129.0, "step": 30 }, { "accuracy": 0.9473684210526315, "epoch": 1.0821917808219177, "grad_norm": 33.5, "learning_rate": 3.900000000000001e-06, "loss": 0.3195, "margin": 1.2310598273026316, "max_reward": 2.8731496710526314, "mean_reward": 2.2488820929276314, "min_reward": 1.6246145148026316, "num_tokens": 134446.0, "step": 40 }, { "accuracy": 1.0, "epoch": 1.356164383561644, "grad_norm": 0.357421875, "learning_rate": 4.9000000000000005e-06, "loss": 0.0364, "margin": 6.142855834960938, "max_reward": 5.8197265625, "mean_reward": 2.7482986450195312, "min_reward": -0.3231292724609375, "num_tokens": 168483.0, "step": 50 }, { "epoch": 1.356164383561644, "eval_accuracy": 1.0, "eval_loss": 9.938376024365425e-05, "eval_margin": 12.761437618371213, "eval_max_reward": 9.329545454545455, "eval_mean_reward": 2.9488266453598486, "eval_min_reward": -3.431892163825758, "eval_num_tokens": 168483.0, "eval_runtime": 1.3469, "eval_samples_per_second": 24.501, "eval_steps_per_second": 24.501, "step": 50 }, { "accuracy": 1.0, "epoch": 1.6301369863013697, "grad_norm": 2.86102294921875e-05, "learning_rate": 4.736217705571989e-06, "loss": 0.0, "margin": 17.22467498779297, "max_reward": 11.6734375, "mean_reward": 3.0611000061035156, "min_reward": -5.551237487792969, "num_tokens": 202410.0, "step": 60 }, { "accuracy": 1.0, "epoch": 1.904109589041096, "grad_norm": 0.2412109375, "learning_rate": 3.895609305067162e-06, "loss": 0.0001, "margin": 19.885546875, "max_reward": 12.878125, "mean_reward": 2.9353515625, "min_reward": -7.007421875, "num_tokens": 236144.0, "step": 70 }, { "accuracy": 1.0, "epoch": 2.1643835616438354, "grad_norm": 0.00116729736328125, "learning_rate": 2.6929386553166165e-06, "loss": 0.0, "margin": 19.66786595394737, "max_reward": 12.293071546052632, "mean_reward": 2.4591385690789473, "min_reward": -7.374794407894737, "num_tokens": 268578.0, "step": 80 }, { "accuracy": 1.0, "epoch": 2.4383561643835616, "grad_norm": 2.396106719970703e-05, "learning_rate": 1.4402140232253486e-06, "loss": 0.0, "margin": 20.334765625, "max_reward": 12.90859375, "mean_reward": 2.7412109375, "min_reward": -7.426171875, "num_tokens": 303010.0, "step": 90 }, { "accuracy": 1.0, "epoch": 2.712328767123288, "grad_norm": 0.00677490234375, "learning_rate": 4.624291562079719e-07, "loss": 0.0, "margin": 19.507958984375, "max_reward": 12.3116943359375, "mean_reward": 2.55771484375, "min_reward": -7.1962646484375, "num_tokens": 336849.0, "step": 100 }, { "epoch": 2.712328767123288, "eval_accuracy": 1.0, "eval_loss": 7.878235010139178e-06, "eval_margin": 19.414299242424242, "eval_max_reward": 12.067708333333334, "eval_mean_reward": 2.360558712121212, "eval_min_reward": -7.346590909090909, "eval_num_tokens": 336849.0, "eval_runtime": 1.3603, "eval_samples_per_second": 24.26, "eval_steps_per_second": 24.26, "step": 100 }, { "accuracy": 1.0, "epoch": 2.9863013698630136, "grad_norm": 6.866455078125e-05, "learning_rate": 1.3250310963527358e-08, "loss": 0.0, "margin": 20.3682373046875, "max_reward": 12.6564453125, "mean_reward": 2.47232666015625, "min_reward": -7.7117919921875, "num_tokens": 370214.0, "step": 110 } ], "logging_steps": 10, "max_steps": 111, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3167290892685312.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }