{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9962157048249763,
"eval_steps": 500,
"global_step": 162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 191.35738502801618,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": 4.0547356605529785,
"logits/rejected": 3.9409475326538086,
"logps/chosen": -31881.79296875,
"logps/pi_response": -18460.900390625,
"logps/ref_response": -18460.900390625,
"logps/rejected": -32645.6484375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"eta": 0.0009999999310821295,
"grad_norm": 268.63004507915036,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": 3.992137908935547,
"logits/rejected": 3.936304807662964,
"logps/chosen": -32467.201171875,
"logps/pi_response": -19245.36328125,
"logps/ref_response": -19250.427734375,
"logps/rejected": -32452.345703125,
"loss": 0.6758,
"rewards/accuracies": 0.26923078298568726,
"rewards/chosen": -0.5232506990432739,
"rewards/margins": 0.004823178984224796,
"rewards/rejected": -0.5280739068984985,
"step": 10
},
{
"epoch": 0.12,
"eta": 0.0010000000474974513,
"grad_norm": 292.31335814293095,
"learning_rate": 4.99472085783721e-07,
"logits/chosen": 3.628862142562866,
"logits/rejected": 3.504181146621704,
"logps/chosen": -33230.75,
"logps/pi_response": -17491.447265625,
"logps/ref_response": -17562.1328125,
"logps/rejected": -32794.78125,
"loss": 0.7567,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": 0.06102239713072777,
"rewards/margins": 1.0328426361083984,
"rewards/rejected": -0.9718202948570251,
"step": 20
},
{
"epoch": 0.18,
"eta": 0.0010000000474974513,
"grad_norm": 325.0539602068029,
"learning_rate": 4.901488388458247e-07,
"logits/chosen": 3.735680103302002,
"logits/rejected": 3.570699453353882,
"logps/chosen": -31700.63671875,
"logps/pi_response": -18340.26953125,
"logps/ref_response": -18427.099609375,
"logps/rejected": -32393.19921875,
"loss": 0.7141,
"rewards/accuracies": 0.4769230782985687,
"rewards/chosen": 0.9040184617042542,
"rewards/margins": 1.0169713497161865,
"rewards/rejected": -0.11295279860496521,
"step": 30
},
{
"epoch": 0.25,
"eta": 0.0010000000474974513,
"grad_norm": 527.7409272120059,
"learning_rate": 4.695964991097616e-07,
"logits/chosen": 3.895160436630249,
"logits/rejected": 3.7793385982513428,
"logps/chosen": -32656.44921875,
"logps/pi_response": -18499.296875,
"logps/ref_response": -18572.958984375,
"logps/rejected": -32907.3125,
"loss": 0.6952,
"rewards/accuracies": 0.4769230782985687,
"rewards/chosen": 0.636614978313446,
"rewards/margins": 0.6588320732116699,
"rewards/rejected": -0.022217150777578354,
"step": 40
},
{
"epoch": 0.31,
"eta": 0.0010000000474974513,
"grad_norm": 253.1576881360588,
"learning_rate": 4.3877607113930516e-07,
"logits/chosen": 3.648803949356079,
"logits/rejected": 3.5239527225494385,
"logps/chosen": -32680.76953125,
"logps/pi_response": -17495.150390625,
"logps/ref_response": -17521.41015625,
"logps/rejected": -32909.08984375,
"loss": 0.6422,
"rewards/accuracies": 0.42692306637763977,
"rewards/chosen": -0.40684789419174194,
"rewards/margins": 0.6205180287361145,
"rewards/rejected": -1.027365803718567,
"step": 50
},
{
"epoch": 0.37,
"eta": 0.0010000000474974513,
"grad_norm": 320.74960989731557,
"learning_rate": 3.991286838919086e-07,
"logits/chosen": 3.655803918838501,
"logits/rejected": 3.5389180183410645,
"logps/chosen": -33417.07421875,
"logps/pi_response": -18680.818359375,
"logps/ref_response": -18699.423828125,
"logps/rejected": -32173.2578125,
"loss": 0.6291,
"rewards/accuracies": 0.5230769515037537,
"rewards/chosen": -0.396637499332428,
"rewards/margins": 0.7289350032806396,
"rewards/rejected": -1.1255724430084229,
"step": 60
},
{
"epoch": 0.43,
"eta": 0.0010000000474974513,
"grad_norm": 181.97105487380267,
"learning_rate": 3.52508205130354e-07,
"logits/chosen": 3.8402271270751953,
"logits/rejected": 3.7171554565429688,
"logps/chosen": -32983.04296875,
"logps/pi_response": -18114.55859375,
"logps/ref_response": -18124.4375,
"logps/rejected": -32464.29296875,
"loss": 0.6053,
"rewards/accuracies": 0.4615384638309479,
"rewards/chosen": -0.46990343928337097,
"rewards/margins": 0.38820669054985046,
"rewards/rejected": -0.858110249042511,
"step": 70
},
{
"epoch": 0.49,
"eta": 0.0010000000474974513,
"grad_norm": 152.1361225175544,
"learning_rate": 3.010945566265912e-07,
"logits/chosen": 3.8307697772979736,
"logits/rejected": 3.6418731212615967,
"logps/chosen": -33045.02734375,
"logps/pi_response": -18411.818359375,
"logps/ref_response": -18399.951171875,
"logps/rejected": -32295.091796875,
"loss": 0.5919,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.7943554520606995,
"rewards/margins": 0.64806067943573,
"rewards/rejected": -1.4424160718917847,
"step": 80
},
{
"epoch": 0.55,
"eta": 0.0010000000474974513,
"grad_norm": 254.93088711947533,
"learning_rate": 2.4729178344249006e-07,
"logits/chosen": 3.744925022125244,
"logits/rejected": 3.602116107940674,
"logps/chosen": -32183.412109375,
"logps/pi_response": -18219.287109375,
"logps/ref_response": -18195.8125,
"logps/rejected": -32723.04296875,
"loss": 0.5946,
"rewards/accuracies": 0.5307692289352417,
"rewards/chosen": -0.8943226933479309,
"rewards/margins": 0.6636541485786438,
"rewards/rejected": -1.5579768419265747,
"step": 90
},
{
"epoch": 0.61,
"eta": 0.0010000000474974513,
"grad_norm": 155.63742344555874,
"learning_rate": 1.9361564345465145e-07,
"logits/chosen": 3.838331460952759,
"logits/rejected": 3.709167003631592,
"logps/chosen": -33436.125,
"logps/pi_response": -16940.001953125,
"logps/ref_response": -16922.8671875,
"logps/rejected": -32356.5234375,
"loss": 0.6018,
"rewards/accuracies": 0.48846152424812317,
"rewards/chosen": -0.7997922301292419,
"rewards/margins": 0.4609057307243347,
"rewards/rejected": -1.2606979608535767,
"step": 100
},
{
"epoch": 0.68,
"eta": 0.0010000000474974513,
"grad_norm": 89.01442356583244,
"learning_rate": 1.4257597331216208e-07,
"logits/chosen": 3.98964524269104,
"logits/rejected": 3.8796193599700928,
"logps/chosen": -32653.2890625,
"logps/pi_response": -17906.2265625,
"logps/ref_response": -17896.880859375,
"logps/rejected": -32700.134765625,
"loss": 0.6,
"rewards/accuracies": 0.4923076927661896,
"rewards/chosen": -0.20051293075084686,
"rewards/margins": 0.46511974930763245,
"rewards/rejected": -0.6656327247619629,
"step": 110
},
{
"epoch": 0.74,
"eta": 0.0010000000474974513,
"grad_norm": 83.3705719230307,
"learning_rate": 9.655933126436563e-08,
"logits/chosen": 3.9262661933898926,
"logits/rejected": 3.8268930912017822,
"logps/chosen": -32867.8359375,
"logps/pi_response": -17454.138671875,
"logps/ref_response": -17433.240234375,
"logps/rejected": -32855.66015625,
"loss": 0.5881,
"rewards/accuracies": 0.5192307829856873,
"rewards/chosen": -0.674457311630249,
"rewards/margins": 0.4932273328304291,
"rewards/rejected": -1.167684555053711,
"step": 120
},
{
"epoch": 0.8,
"eta": 0.0010000000474974513,
"grad_norm": 74.20493318211953,
"learning_rate": 5.771740434959277e-08,
"logits/chosen": 3.9484238624572754,
"logits/rejected": 3.8283846378326416,
"logps/chosen": -33543.33984375,
"logps/pi_response": -18303.177734375,
"logps/ref_response": -18285.470703125,
"logps/rejected": -32467.873046875,
"loss": 0.5825,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.9354388117790222,
"rewards/margins": 0.5288434624671936,
"rewards/rejected": -1.4642821550369263,
"step": 130
},
{
"epoch": 0.86,
"eta": 0.0010000000474974513,
"grad_norm": 214.3061926146843,
"learning_rate": 2.7866397900677185e-08,
"logits/chosen": 3.9201173782348633,
"logits/rejected": 3.743128538131714,
"logps/chosen": -32812.75390625,
"logps/pi_response": -17746.27734375,
"logps/ref_response": -17724.193359375,
"logps/rejected": -32488.669921875,
"loss": 0.5732,
"rewards/accuracies": 0.5461538434028625,
"rewards/chosen": -1.3605297803878784,
"rewards/margins": 0.7671653032302856,
"rewards/rejected": -2.127694845199585,
"step": 140
},
{
"epoch": 0.92,
"eta": 0.0010000000474974513,
"grad_norm": 233.45352677814805,
"learning_rate": 8.402111802159412e-09,
"logits/chosen": 3.8903307914733887,
"logits/rejected": 3.79535174369812,
"logps/chosen": -32934.5390625,
"logps/pi_response": -18709.8828125,
"logps/ref_response": -18687.2265625,
"logps/rejected": -33072.52734375,
"loss": 0.5728,
"rewards/accuracies": 0.48076921701431274,
"rewards/chosen": -0.8628613948822021,
"rewards/margins": 0.49873629212379456,
"rewards/rejected": -1.3615975379943848,
"step": 150
},
{
"epoch": 0.98,
"eta": 0.0010000000474974513,
"grad_norm": 86.80955639874749,
"learning_rate": 2.3467443900582197e-10,
"logits/chosen": 3.8695499897003174,
"logits/rejected": 3.724947929382324,
"logps/chosen": -32277.80859375,
"logps/pi_response": -15952.685546875,
"logps/ref_response": -15933.2509765625,
"logps/rejected": -32813.515625,
"loss": 0.5703,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.6783939003944397,
"rewards/margins": 0.6465076208114624,
"rewards/rejected": -1.3249014616012573,
"step": 160
},
{
"epoch": 1.0,
"step": 162,
"total_flos": 0.0,
"train_loss": 0.6241708625981837,
"train_runtime": 25355.9873,
"train_samples_per_second": 0.834,
"train_steps_per_second": 0.006
}
],
"logging_steps": 10,
"max_steps": 162,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}