two_agent_1_rdpo_iter_2 / trainer_state.json
YYYYYYibo's picture
Model save
950b756 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9945,
"eval_steps": 500,
"global_step": 153,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"eta": 0.0010000000474974513,
"grad_norm": 14.745374712777657,
"learning_rate": 3.125e-08,
"logits/chosen": -1.480354905128479,
"logits/rejected": -1.5607078075408936,
"logps/chosen": -113.47530364990234,
"logps/pi_response": -223.8134002685547,
"logps/ref_response": -223.8134002685547,
"logps/rejected": -112.02357482910156,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.07,
"eta": 0.0009999999310821295,
"grad_norm": 13.943252568407653,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -1.861205816268921,
"logits/rejected": -1.8467286825180054,
"logps/chosen": -159.73291015625,
"logps/pi_response": -273.9164733886719,
"logps/ref_response": -273.0810852050781,
"logps/rejected": -158.10842895507812,
"loss": 0.693,
"rewards/accuracies": 0.46581196784973145,
"rewards/chosen": -0.0030446185264736414,
"rewards/margins": 0.0008683722116984427,
"rewards/rejected": -0.003912990912795067,
"step": 10
},
{
"epoch": 0.13,
"eta": 0.0010000000474974513,
"grad_norm": 14.636770074330482,
"learning_rate": 4.989490450759331e-07,
"logits/chosen": -1.6749669313430786,
"logits/rejected": -1.6249001026153564,
"logps/chosen": -181.44686889648438,
"logps/pi_response": -299.26806640625,
"logps/ref_response": -269.5531921386719,
"logps/rejected": -184.55027770996094,
"loss": 0.6921,
"rewards/accuracies": 0.5461538434028625,
"rewards/chosen": -0.15622131526470184,
"rewards/margins": 0.010419250465929508,
"rewards/rejected": -0.16664059460163116,
"step": 20
},
{
"epoch": 0.2,
"eta": 0.0010000000474974513,
"grad_norm": 18.84883262757341,
"learning_rate": 4.872270441827174e-07,
"logits/chosen": -1.5554673671722412,
"logits/rejected": -1.4832121133804321,
"logps/chosen": -187.2523956298828,
"logps/pi_response": -331.72064208984375,
"logps/ref_response": -270.0771484375,
"logps/rejected": -184.25772094726562,
"loss": 0.694,
"rewards/accuracies": 0.4961538314819336,
"rewards/chosen": -0.3374003469944,
"rewards/margins": 0.01481544878333807,
"rewards/rejected": -0.35221579670906067,
"step": 30
},
{
"epoch": 0.26,
"eta": 0.0010000000474974513,
"grad_norm": 19.234325247102298,
"learning_rate": 4.6308512113530063e-07,
"logits/chosen": -1.6449016332626343,
"logits/rejected": -1.708786129951477,
"logps/chosen": -188.21363830566406,
"logps/pi_response": -333.604736328125,
"logps/ref_response": -275.1395263671875,
"logps/rejected": -197.63232421875,
"loss": 0.6861,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.23007477819919586,
"rewards/margins": 0.013713112100958824,
"rewards/rejected": -0.24378788471221924,
"step": 40
},
{
"epoch": 0.33,
"eta": 0.0010000000474974513,
"grad_norm": 17.138306112556428,
"learning_rate": 4.277872161641681e-07,
"logits/chosen": -1.726422667503357,
"logits/rejected": -1.659047245979309,
"logps/chosen": -173.61680603027344,
"logps/pi_response": -315.35235595703125,
"logps/ref_response": -265.1530456542969,
"logps/rejected": -175.0764617919922,
"loss": 0.6886,
"rewards/accuracies": 0.5461538434028625,
"rewards/chosen": -0.1452452391386032,
"rewards/margins": 0.029706543311476707,
"rewards/rejected": -0.17495179176330566,
"step": 50
},
{
"epoch": 0.39,
"eta": 0.0010000000474974513,
"grad_norm": 15.457021507203054,
"learning_rate": 3.8318133624280046e-07,
"logits/chosen": -1.712626338005066,
"logits/rejected": -1.722040057182312,
"logps/chosen": -179.5359649658203,
"logps/pi_response": -311.54986572265625,
"logps/ref_response": -265.8667907714844,
"logps/rejected": -191.751708984375,
"loss": 0.6849,
"rewards/accuracies": 0.5653846263885498,
"rewards/chosen": -0.16358985006809235,
"rewards/margins": 0.03767317533493042,
"rewards/rejected": -0.20126302540302277,
"step": 60
},
{
"epoch": 0.46,
"eta": 0.0010000000474974513,
"grad_norm": 25.055768196503276,
"learning_rate": 3.316028034595861e-07,
"logits/chosen": -1.7020467519760132,
"logits/rejected": -1.7025904655456543,
"logps/chosen": -185.90699768066406,
"logps/pi_response": -315.3182067871094,
"logps/ref_response": -276.3799743652344,
"logps/rejected": -189.57199096679688,
"loss": 0.6889,
"rewards/accuracies": 0.5076923370361328,
"rewards/chosen": -0.18907414376735687,
"rewards/margins": -0.0005933608626946807,
"rewards/rejected": -0.18848079442977905,
"step": 70
},
{
"epoch": 0.52,
"eta": 0.0010000000474974513,
"grad_norm": 20.33794636752797,
"learning_rate": 2.7575199021178855e-07,
"logits/chosen": -1.3984944820404053,
"logits/rejected": -1.3162914514541626,
"logps/chosen": -193.8706817626953,
"logps/pi_response": -336.3244323730469,
"logps/ref_response": -272.45025634765625,
"logps/rejected": -203.4906005859375,
"loss": 0.6834,
"rewards/accuracies": 0.5692307949066162,
"rewards/chosen": -0.31467124819755554,
"rewards/margins": 0.06254380196332932,
"rewards/rejected": -0.37721511721611023,
"step": 80
},
{
"epoch": 0.58,
"eta": 0.0010000000474974513,
"grad_norm": 17.59187813851558,
"learning_rate": 2.1855294234408068e-07,
"logits/chosen": -1.2510021924972534,
"logits/rejected": -1.2377079725265503,
"logps/chosen": -209.71649169921875,
"logps/pi_response": -354.3184509277344,
"logps/ref_response": -265.3123474121094,
"logps/rejected": -218.32000732421875,
"loss": 0.686,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.5378236770629883,
"rewards/margins": 0.04319743812084198,
"rewards/rejected": -0.5810210704803467,
"step": 90
},
{
"epoch": 0.65,
"eta": 0.0010000000474974513,
"grad_norm": 15.830895593862458,
"learning_rate": 1.6300029195778453e-07,
"logits/chosen": -1.2602006196975708,
"logits/rejected": -1.3293911218643188,
"logps/chosen": -204.08714294433594,
"logps/pi_response": -333.2199401855469,
"logps/ref_response": -267.4874572753906,
"logps/rejected": -209.34811401367188,
"loss": 0.6896,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.438109815120697,
"rewards/margins": 0.006122402846813202,
"rewards/rejected": -0.44423219561576843,
"step": 100
},
{
"epoch": 0.71,
"eta": 0.0010000000474974513,
"grad_norm": 16.101030751126824,
"learning_rate": 1.1200247470632392e-07,
"logits/chosen": -1.4597405195236206,
"logits/rejected": -1.4979974031448364,
"logps/chosen": -195.9895782470703,
"logps/pi_response": -342.63812255859375,
"logps/ref_response": -289.2621765136719,
"logps/rejected": -196.60757446289062,
"loss": 0.6909,
"rewards/accuracies": 0.5307692289352417,
"rewards/chosen": -0.37045371532440186,
"rewards/margins": 0.006542083341628313,
"rewards/rejected": -0.3769958019256592,
"step": 110
},
{
"epoch": 0.78,
"eta": 0.0010000000474974513,
"grad_norm": 14.757730514557847,
"learning_rate": 6.822945986946385e-08,
"logits/chosen": -1.2902206182479858,
"logits/rejected": -1.3999152183532715,
"logps/chosen": -195.00039672851562,
"logps/pi_response": -312.9175720214844,
"logps/ref_response": -265.36669921875,
"logps/rejected": -204.87124633789062,
"loss": 0.6806,
"rewards/accuracies": 0.6153846383094788,
"rewards/chosen": -0.32740989327430725,
"rewards/margins": 0.04114748165011406,
"rewards/rejected": -0.368557333946228,
"step": 120
},
{
"epoch": 0.84,
"eta": 0.0010000000474974513,
"grad_norm": 15.906731544868482,
"learning_rate": 3.397296523427806e-08,
"logits/chosen": -1.066477656364441,
"logits/rejected": -1.102446436882019,
"logps/chosen": -194.1769561767578,
"logps/pi_response": -325.8312683105469,
"logps/ref_response": -258.591552734375,
"logps/rejected": -202.23509216308594,
"loss": 0.682,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": -0.433013916015625,
"rewards/margins": 0.026987465098500252,
"rewards/rejected": -0.4600013792514801,
"step": 130
},
{
"epoch": 0.91,
"eta": 0.0010000000474974513,
"grad_norm": 14.158010485809152,
"learning_rate": 1.1026475173977978e-08,
"logits/chosen": -1.430467963218689,
"logits/rejected": -1.3737958669662476,
"logps/chosen": -206.46617126464844,
"logps/pi_response": -334.0456848144531,
"logps/ref_response": -264.9248352050781,
"logps/rejected": -205.56634521484375,
"loss": 0.6856,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.42677730321884155,
"rewards/margins": 0.021916242316365242,
"rewards/rejected": -0.44869354367256165,
"step": 140
},
{
"epoch": 0.97,
"eta": 0.0010000000474974513,
"grad_norm": 15.33322379707595,
"learning_rate": 5.913435276374834e-10,
"logits/chosen": -1.2695854902267456,
"logits/rejected": -1.342061996459961,
"logps/chosen": -197.76773071289062,
"logps/pi_response": -341.5782165527344,
"logps/ref_response": -275.605224609375,
"logps/rejected": -205.83673095703125,
"loss": 0.681,
"rewards/accuracies": 0.5730769038200378,
"rewards/chosen": -0.42948493361473083,
"rewards/margins": 0.03518623486161232,
"rewards/rejected": -0.46467119455337524,
"step": 150
},
{
"epoch": 0.99,
"step": 153,
"total_flos": 0.0,
"train_loss": 0.687004194540136,
"train_runtime": 23329.5349,
"train_samples_per_second": 0.857,
"train_steps_per_second": 0.007
}
],
"logging_steps": 10,
"max_steps": 153,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}