two_agent_2_rdpo_iter_2 / trainer_state.json
YYYYYYibo's picture
Model save
3a95bf9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 0.20925004399681482,
"learning_rate": 3.125e-08,
"logits/chosen": -1.6728180646896362,
"logits/rejected": -1.6728180646896362,
"logps/chosen": -139.26568603515625,
"logps/pi_response": -223.70187377929688,
"logps/ref_response": -223.70187377929688,
"logps/rejected": -139.26568603515625,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"eta": 0.0009999999310821295,
"grad_norm": 0.2664449122199502,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.9198987483978271,
"logits/rejected": -1.9198987483978271,
"logps/chosen": -185.7984161376953,
"logps/pi_response": -284.7489929199219,
"logps/ref_response": -274.8498229980469,
"logps/rejected": -185.7984161376953,
"loss": 0.693,
"rewards/accuracies": 0.09829059988260269,
"rewards/chosen": -0.05739467218518257,
"rewards/margins": -8.19944467878031e-09,
"rewards/rejected": -0.057394664734601974,
"step": 10
},
{
"epoch": 0.13,
"eta": 0.0010000000474974513,
"grad_norm": 0.17607425906899,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -1.2133342027664185,
"logits/rejected": -1.2133342027664185,
"logps/chosen": -344.1175231933594,
"logps/pi_response": -420.7164001464844,
"logps/ref_response": -268.8954772949219,
"logps/rejected": -344.1175231933594,
"loss": 0.6916,
"rewards/accuracies": 0.17307692766189575,
"rewards/chosen": -1.5597572326660156,
"rewards/margins": 2.3523059056174134e-08,
"rewards/rejected": -1.5597573518753052,
"step": 20
},
{
"epoch": 0.2,
"eta": 0.0010000000474974513,
"grad_norm": 0.9959359173659126,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": 0.10036426037549973,
"logits/rejected": 0.10036426037549973,
"logps/chosen": -764.1023559570312,
"logps/pi_response": -850.1804809570312,
"logps/ref_response": -272.0489807128906,
"logps/rejected": -764.1023559570312,
"loss": 0.6873,
"rewards/accuracies": 0.11153846234083176,
"rewards/chosen": -5.858819007873535,
"rewards/margins": -3.943076620771535e-08,
"rewards/rejected": -5.858819484710693,
"step": 30
},
{
"epoch": 0.26,
"eta": 0.0010000000474974513,
"grad_norm": 1.9030553381279938,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": 2.6020889282226562,
"logits/rejected": 2.6020889282226562,
"logps/chosen": -15368.376953125,
"logps/pi_response": -9728.86328125,
"logps/ref_response": -275.9498596191406,
"logps/rejected": -15368.376953125,
"loss": 0.6,
"rewards/accuracies": 0.042307693511247635,
"rewards/chosen": -151.69932556152344,
"rewards/margins": 1.8339891028062993e-07,
"rewards/rejected": -151.69932556152344,
"step": 40
},
{
"epoch": 0.33,
"eta": 0.0010000000474974513,
"grad_norm": 0.9022013856679871,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": 5.100710391998291,
"logits/rejected": 5.100710391998291,
"logps/chosen": -25452.970703125,
"logps/pi_response": -16005.287109375,
"logps/ref_response": -266.91033935546875,
"logps/rejected": -25452.970703125,
"loss": 0.5304,
"rewards/accuracies": 0.023076923564076424,
"rewards/chosen": -252.7429656982422,
"rewards/margins": -4.69501202360334e-07,
"rewards/rejected": -252.7429656982422,
"step": 50
},
{
"epoch": 0.39,
"eta": 0.0010000000474974513,
"grad_norm": 0.9078413895498612,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": 4.621513843536377,
"logits/rejected": 4.621513843536377,
"logps/chosen": -29423.666015625,
"logps/pi_response": -17608.6015625,
"logps/ref_response": -265.94757080078125,
"logps/rejected": -29423.666015625,
"loss": 0.519,
"rewards/accuracies": 0.03076923079788685,
"rewards/chosen": -292.3346862792969,
"rewards/margins": 6.455641710090276e-07,
"rewards/rejected": -292.3346862792969,
"step": 60
},
{
"epoch": 0.46,
"eta": 0.0010000000474974513,
"grad_norm": 0.7756777568132545,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": 4.356642723083496,
"logits/rejected": 4.356642723083496,
"logps/chosen": -29171.62890625,
"logps/pi_response": -18937.943359375,
"logps/ref_response": -276.5423278808594,
"logps/rejected": -29171.62890625,
"loss": 0.5212,
"rewards/accuracies": 0.015384615398943424,
"rewards/chosen": -289.7953796386719,
"rewards/margins": 0.0,
"rewards/rejected": -289.7953796386719,
"step": 70
},
{
"epoch": 0.52,
"eta": 0.0010000000474974513,
"grad_norm": 0.9174650913563618,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": 4.428848743438721,
"logits/rejected": 4.428848743438721,
"logps/chosen": -27704.791015625,
"logps/pi_response": -18176.86328125,
"logps/ref_response": -271.93060302734375,
"logps/rejected": -27704.791015625,
"loss": 0.5132,
"rewards/accuracies": 0.01923076994717121,
"rewards/chosen": -275.2251281738281,
"rewards/margins": -9.97690108306415e-07,
"rewards/rejected": -275.2251281738281,
"step": 80
},
{
"epoch": 0.58,
"eta": 0.0010000000474974513,
"grad_norm": 0.9019855937057101,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": 4.3772735595703125,
"logits/rejected": 4.3772735595703125,
"logps/chosen": -29193.177734375,
"logps/pi_response": -17440.9921875,
"logps/ref_response": -266.1241455078125,
"logps/rejected": -29193.177734375,
"loss": 0.5231,
"rewards/accuracies": 0.03076923079788685,
"rewards/chosen": -290.08026123046875,
"rewards/margins": -2.9343825147520874e-08,
"rewards/rejected": -290.0802307128906,
"step": 90
},
{
"epoch": 0.65,
"eta": 0.0010000000474974513,
"grad_norm": 0.8179873354250534,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": 4.375646114349365,
"logits/rejected": 4.375646114349365,
"logps/chosen": -27328.33203125,
"logps/pi_response": -17462.0546875,
"logps/ref_response": -267.173828125,
"logps/rejected": -27328.33203125,
"loss": 0.5171,
"rewards/accuracies": 0.026923077180981636,
"rewards/chosen": -271.51708984375,
"rewards/margins": 1.613910427522569e-07,
"rewards/rejected": -271.51708984375,
"step": 100
},
{
"epoch": 0.71,
"eta": 0.0010000000474974513,
"grad_norm": 0.9895798769556228,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": 4.529591083526611,
"logits/rejected": 4.529591083526611,
"logps/chosen": -28326.220703125,
"logps/pi_response": -18734.5078125,
"logps/ref_response": -291.685791015625,
"logps/rejected": -28326.220703125,
"loss": 0.5103,
"rewards/accuracies": 0.04615384712815285,
"rewards/chosen": -281.44744873046875,
"rewards/margins": 1.540550869094659e-07,
"rewards/rejected": -281.44744873046875,
"step": 110
},
{
"epoch": 0.78,
"eta": 0.0010000000474974513,
"grad_norm": 0.9157342554933109,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": 4.574192523956299,
"logits/rejected": 4.574192523956299,
"logps/chosen": -29500.658203125,
"logps/pi_response": -18083.599609375,
"logps/ref_response": -266.3084716796875,
"logps/rejected": -29500.658203125,
"loss": 0.5148,
"rewards/accuracies": 0.05384615436196327,
"rewards/chosen": -293.0876159667969,
"rewards/margins": 1.4085036355027114e-06,
"rewards/rejected": -293.0876159667969,
"step": 120
},
{
"epoch": 0.84,
"eta": 0.0010000000474974513,
"grad_norm": 0.932891201307216,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": 4.706723213195801,
"logits/rejected": 4.706723213195801,
"logps/chosen": -26521.8828125,
"logps/pi_response": -15752.3818359375,
"logps/ref_response": -260.1748046875,
"logps/rejected": -26521.884765625,
"loss": 0.5218,
"rewards/accuracies": 0.04615384712815285,
"rewards/chosen": -263.5008544921875,
"rewards/margins": 8.876506853994215e-07,
"rewards/rejected": -263.5008544921875,
"step": 130
},
{
"epoch": 0.91,
"eta": 0.0010000000474974513,
"grad_norm": 0.7630353244282155,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": 4.662195205688477,
"logits/rejected": 4.662195205688477,
"logps/chosen": -29146.107421875,
"logps/pi_response": -18217.185546875,
"logps/ref_response": -266.84295654296875,
"logps/rejected": -29146.107421875,
"loss": 0.5144,
"rewards/accuracies": 0.03846153989434242,
"rewards/chosen": -289.6330261230469,
"rewards/margins": 5.868765029504175e-08,
"rewards/rejected": -289.6330261230469,
"step": 140
},
{
"epoch": 0.97,
"eta": 0.0010000000474974513,
"grad_norm": 0.7944892762549526,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": 4.59439754486084,
"logits/rejected": 4.59439754486084,
"logps/chosen": -28846.330078125,
"logps/pi_response": -18351.314453125,
"logps/ref_response": -276.857666015625,
"logps/rejected": -28846.330078125,
"loss": 0.5191,
"rewards/accuracies": 0.03846153989434242,
"rewards/chosen": -286.6873779296875,
"rewards/margins": -8.803147579783399e-07,
"rewards/rejected": -286.6873779296875,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 0.0094,
"train_samples_per_second": 2122892.066,
"train_steps_per_second": 16240.124
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}