MMattaparthy's picture
Upload reward model
12bf163 verified
{
"best_global_step": 100,
"best_metric": 7.878235010139178e-06,
"best_model_checkpoint": "models/reward-model/checkpoint-100",
"epoch": 3.0,
"eval_steps": 50,
"global_step": 111,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"accuracy": 0.475,
"epoch": 0.273972602739726,
"grad_norm": 30.625,
"learning_rate": 9.000000000000001e-07,
"loss": 0.6891,
"margin": 0.01259765625,
"max_reward": 0.8927734375,
"mean_reward": 0.841064453125,
"min_reward": 0.78935546875,
"num_tokens": 33836.0,
"step": 10
},
{
"accuracy": 0.7,
"epoch": 0.547945205479452,
"grad_norm": 31.0,
"learning_rate": 1.9000000000000002e-06,
"loss": 0.6556,
"margin": 0.08076171875,
"max_reward": 0.940087890625,
"mean_reward": 0.881591796875,
"min_reward": 0.823095703125,
"num_tokens": 68438.0,
"step": 20
},
{
"accuracy": 0.875,
"epoch": 0.821917808219178,
"grad_norm": 42.75,
"learning_rate": 2.9e-06,
"loss": 0.5442,
"margin": 0.348681640625,
"max_reward": 1.237109375,
"mean_reward": 1.0571533203125,
"min_reward": 0.877197265625,
"num_tokens": 102129.0,
"step": 30
},
{
"accuracy": 0.9473684210526315,
"epoch": 1.0821917808219177,
"grad_norm": 33.5,
"learning_rate": 3.900000000000001e-06,
"loss": 0.3195,
"margin": 1.2310598273026316,
"max_reward": 2.8731496710526314,
"mean_reward": 2.2488820929276314,
"min_reward": 1.6246145148026316,
"num_tokens": 134446.0,
"step": 40
},
{
"accuracy": 1.0,
"epoch": 1.356164383561644,
"grad_norm": 0.357421875,
"learning_rate": 4.9000000000000005e-06,
"loss": 0.0364,
"margin": 6.142855834960938,
"max_reward": 5.8197265625,
"mean_reward": 2.7482986450195312,
"min_reward": -0.3231292724609375,
"num_tokens": 168483.0,
"step": 50
},
{
"epoch": 1.356164383561644,
"eval_accuracy": 1.0,
"eval_loss": 9.938376024365425e-05,
"eval_margin": 12.761437618371213,
"eval_max_reward": 9.329545454545455,
"eval_mean_reward": 2.9488266453598486,
"eval_min_reward": -3.431892163825758,
"eval_num_tokens": 168483.0,
"eval_runtime": 1.3469,
"eval_samples_per_second": 24.501,
"eval_steps_per_second": 24.501,
"step": 50
},
{
"accuracy": 1.0,
"epoch": 1.6301369863013697,
"grad_norm": 2.86102294921875e-05,
"learning_rate": 4.736217705571989e-06,
"loss": 0.0,
"margin": 17.22467498779297,
"max_reward": 11.6734375,
"mean_reward": 3.0611000061035156,
"min_reward": -5.551237487792969,
"num_tokens": 202410.0,
"step": 60
},
{
"accuracy": 1.0,
"epoch": 1.904109589041096,
"grad_norm": 0.2412109375,
"learning_rate": 3.895609305067162e-06,
"loss": 0.0001,
"margin": 19.885546875,
"max_reward": 12.878125,
"mean_reward": 2.9353515625,
"min_reward": -7.007421875,
"num_tokens": 236144.0,
"step": 70
},
{
"accuracy": 1.0,
"epoch": 2.1643835616438354,
"grad_norm": 0.00116729736328125,
"learning_rate": 2.6929386553166165e-06,
"loss": 0.0,
"margin": 19.66786595394737,
"max_reward": 12.293071546052632,
"mean_reward": 2.4591385690789473,
"min_reward": -7.374794407894737,
"num_tokens": 268578.0,
"step": 80
},
{
"accuracy": 1.0,
"epoch": 2.4383561643835616,
"grad_norm": 2.396106719970703e-05,
"learning_rate": 1.4402140232253486e-06,
"loss": 0.0,
"margin": 20.334765625,
"max_reward": 12.90859375,
"mean_reward": 2.7412109375,
"min_reward": -7.426171875,
"num_tokens": 303010.0,
"step": 90
},
{
"accuracy": 1.0,
"epoch": 2.712328767123288,
"grad_norm": 0.00677490234375,
"learning_rate": 4.624291562079719e-07,
"loss": 0.0,
"margin": 19.507958984375,
"max_reward": 12.3116943359375,
"mean_reward": 2.55771484375,
"min_reward": -7.1962646484375,
"num_tokens": 336849.0,
"step": 100
},
{
"epoch": 2.712328767123288,
"eval_accuracy": 1.0,
"eval_loss": 7.878235010139178e-06,
"eval_margin": 19.414299242424242,
"eval_max_reward": 12.067708333333334,
"eval_mean_reward": 2.360558712121212,
"eval_min_reward": -7.346590909090909,
"eval_num_tokens": 336849.0,
"eval_runtime": 1.3603,
"eval_samples_per_second": 24.26,
"eval_steps_per_second": 24.26,
"step": 100
},
{
"accuracy": 1.0,
"epoch": 2.9863013698630136,
"grad_norm": 6.866455078125e-05,
"learning_rate": 1.3250310963527358e-08,
"loss": 0.0,
"margin": 20.3682373046875,
"max_reward": 12.6564453125,
"mean_reward": 2.47232666015625,
"min_reward": -7.7117919921875,
"num_tokens": 370214.0,
"step": 110
}
],
"logging_steps": 10,
"max_steps": 111,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3167290892685312.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}