two_agent_1_dpo_iter_2 / trainer_state.json
YYYYYYibo's picture
Model save
87bda1e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 14.538259596842082,
"learning_rate": 3.125e-08,
"logits/chosen": -1.485394835472107,
"logits/rejected": -1.5657753944396973,
"logps/chosen": -113.49234771728516,
"logps/rejected": -112.02042388916016,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"grad_norm": 13.777860556030118,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.8860422372817993,
"logits/rejected": -1.8673903942108154,
"logps/chosen": -159.92677307128906,
"logps/rejected": -158.13575744628906,
"loss": 0.693,
"rewards/accuracies": 0.470085471868515,
"rewards/chosen": -0.0027076357509940863,
"rewards/margins": 0.0009333029738627374,
"rewards/rejected": -0.003640938550233841,
"step": 10
},
{
"epoch": 0.13,
"grad_norm": 15.832935585691502,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -1.714375615119934,
"logits/rejected": -1.6653386354446411,
"logps/chosen": -179.85107421875,
"logps/rejected": -182.88519287109375,
"loss": 0.692,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.14061911404132843,
"rewards/margins": 0.009414789266884327,
"rewards/rejected": -0.15003390610218048,
"step": 20
},
{
"epoch": 0.2,
"grad_norm": 17.171119399204557,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": -1.6256521940231323,
"logits/rejected": -1.5526025295257568,
"logps/chosen": -186.2281036376953,
"logps/rejected": -183.31361389160156,
"loss": 0.6942,
"rewards/accuracies": 0.48461538553237915,
"rewards/chosen": -0.32728826999664307,
"rewards/margins": 0.015691382810473442,
"rewards/rejected": -0.34297963976860046,
"step": 30
},
{
"epoch": 0.26,
"grad_norm": 23.62795629433987,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": -1.723747968673706,
"logits/rejected": -1.7841739654541016,
"logps/chosen": -193.51380920410156,
"logps/rejected": -202.89682006835938,
"loss": 0.6861,
"rewards/accuracies": 0.5307692289352417,
"rewards/chosen": -0.2831575870513916,
"rewards/margins": 0.013270785100758076,
"rewards/rejected": -0.2964283227920532,
"step": 40
},
{
"epoch": 0.33,
"grad_norm": 15.77157102463667,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": -1.7303481101989746,
"logits/rejected": -1.65773344039917,
"logps/chosen": -177.57931518554688,
"logps/rejected": -179.11293029785156,
"loss": 0.6898,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.18453820049762726,
"rewards/margins": 0.02953496389091015,
"rewards/rejected": -0.21407318115234375,
"step": 50
},
{
"epoch": 0.39,
"grad_norm": 16.13100802946847,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": -1.6843277215957642,
"logits/rejected": -1.6873857975006104,
"logps/chosen": -182.8292694091797,
"logps/rejected": -194.4746551513672,
"loss": 0.6849,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.19599129259586334,
"rewards/margins": 0.03240448608994484,
"rewards/rejected": -0.22839577496051788,
"step": 60
},
{
"epoch": 0.46,
"grad_norm": 16.445288992190534,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": -1.6721850633621216,
"logits/rejected": -1.6687787771224976,
"logps/chosen": -195.88442993164062,
"logps/rejected": -199.55630493164062,
"loss": 0.6931,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": -0.2889784276485443,
"rewards/margins": -0.0007830683025531471,
"rewards/rejected": -0.2881953716278076,
"step": 70
},
{
"epoch": 0.52,
"grad_norm": 12.960135155210429,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": -1.4989523887634277,
"logits/rejected": -1.4164642095565796,
"logps/chosen": -184.61236572265625,
"logps/rejected": -192.20050048828125,
"loss": 0.6833,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.22188612818717957,
"rewards/margins": 0.04173959046602249,
"rewards/rejected": -0.26362574100494385,
"step": 80
},
{
"epoch": 0.58,
"grad_norm": 15.682163034505669,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": -1.4179878234863281,
"logits/rejected": -1.4031846523284912,
"logps/chosen": -179.42979431152344,
"logps/rejected": -187.13311767578125,
"loss": 0.6809,
"rewards/accuracies": 0.5730769038200378,
"rewards/chosen": -0.23481449484825134,
"rewards/margins": 0.03382309526205063,
"rewards/rejected": -0.26863762736320496,
"step": 90
},
{
"epoch": 0.65,
"grad_norm": 19.094980323061744,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": -1.2222946882247925,
"logits/rejected": -1.3037612438201904,
"logps/chosen": -191.2831268310547,
"logps/rejected": -198.2336883544922,
"loss": 0.6865,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": -0.3084586560726166,
"rewards/margins": 0.02169790491461754,
"rewards/rejected": -0.3301565647125244,
"step": 100
},
{
"epoch": 0.71,
"grad_norm": 19.896508891597843,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": -1.4282666444778442,
"logits/rejected": -1.4667084217071533,
"logps/chosen": -185.49362182617188,
"logps/rejected": -186.56646728515625,
"loss": 0.6928,
"rewards/accuracies": 0.5115384459495544,
"rewards/chosen": -0.26517340540885925,
"rewards/margins": 0.011802640743553638,
"rewards/rejected": -0.2769760489463806,
"step": 110
},
{
"epoch": 0.78,
"grad_norm": 16.948516897970652,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": -1.1220929622650146,
"logits/rejected": -1.2350115776062012,
"logps/chosen": -189.1597137451172,
"logps/rejected": -199.32754516601562,
"loss": 0.6781,
"rewards/accuracies": 0.6307692527770996,
"rewards/chosen": -0.25235074758529663,
"rewards/margins": 0.046545807272195816,
"rewards/rejected": -0.29889652132987976,
"step": 120
},
{
"epoch": 0.84,
"grad_norm": 17.56220411035667,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": -0.9844478368759155,
"logits/rejected": -1.0250178575515747,
"logps/chosen": -183.03431701660156,
"logps/rejected": -190.74391174316406,
"loss": 0.686,
"rewards/accuracies": 0.5192307829856873,
"rewards/chosen": -0.3220398724079132,
"rewards/margins": 0.022711992263793945,
"rewards/rejected": -0.34475192427635193,
"step": 130
},
{
"epoch": 0.91,
"grad_norm": 15.473766104576516,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": -1.361193299293518,
"logits/rejected": -1.2996608018875122,
"logps/chosen": -193.40740966796875,
"logps/rejected": -192.8422393798828,
"loss": 0.686,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.2967548072338104,
"rewards/margins": 0.024480195716023445,
"rewards/rejected": -0.3212350010871887,
"step": 140
},
{
"epoch": 0.97,
"grad_norm": 17.16639289762015,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": -1.1994727849960327,
"logits/rejected": -1.2772407531738281,
"logps/chosen": -184.84437561035156,
"logps/rejected": -193.1089324951172,
"loss": 0.6824,
"rewards/accuracies": 0.5884615182876587,
"rewards/chosen": -0.3001053035259247,
"rewards/margins": 0.036925580352544785,
"rewards/rejected": -0.33703088760375977,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.6870396254109401,
"train_runtime": 21840.9373,
"train_samples_per_second": 0.916,
"train_steps_per_second": 0.007
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}