MNLP_M3_dpo_model / trainer_state.json
tocico28's picture
Upload folder using huggingface_hub
0f78d79 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.997458513978173,
"eval_steps": 200,
"global_step": 2508,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02989983555090447,
"grad_norm": 78.5,
"learning_rate": 4.780876494023904e-07,
"logits/chosen": -0.8346603512763977,
"logits/rejected": -0.5625396966934204,
"logps/chosen": -311.11248779296875,
"logps/rejected": -290.71624755859375,
"loss": 0.6974,
"rewards/accuracies": 0.3199999928474426,
"rewards/chosen": -0.005879516713321209,
"rewards/margins": -0.0028140258509665728,
"rewards/rejected": -0.003072815015912056,
"step": 25
},
{
"epoch": 0.05979967110180894,
"grad_norm": 112.5,
"learning_rate": 9.760956175298805e-07,
"logits/chosen": -0.8477816581726074,
"logits/rejected": -0.5839244723320007,
"logps/chosen": -341.1449890136719,
"logps/rejected": -303.2749938964844,
"loss": 0.6939,
"rewards/accuracies": 0.33500000834465027,
"rewards/chosen": -0.01889648474752903,
"rewards/margins": 0.0013772583333775401,
"rewards/rejected": -0.020271606743335724,
"step": 50
},
{
"epoch": 0.08969950665271341,
"grad_norm": 89.5,
"learning_rate": 1.4741035856573708e-06,
"logits/chosen": -0.7348077893257141,
"logits/rejected": -0.419241338968277,
"logps/chosen": -311.4237365722656,
"logps/rejected": -284.5274963378906,
"loss": 0.7,
"rewards/accuracies": 0.28999999165534973,
"rewards/chosen": -0.020579833537340164,
"rewards/margins": -0.008827819488942623,
"rewards/rejected": -0.011761474423110485,
"step": 75
},
{
"epoch": 0.11959934220361788,
"grad_norm": 89.5,
"learning_rate": 1.9721115537848607e-06,
"logits/chosen": -0.9120362997055054,
"logits/rejected": -0.566675066947937,
"logps/chosen": -322.989990234375,
"logps/rejected": -276.8037414550781,
"loss": 0.6868,
"rewards/accuracies": 0.3675000071525574,
"rewards/chosen": -0.027477417141199112,
"rewards/margins": 0.018669739365577698,
"rewards/rejected": -0.04612060636281967,
"step": 100
},
{
"epoch": 0.14949917775452234,
"grad_norm": 83.0,
"learning_rate": 2.470119521912351e-06,
"logits/chosen": -0.8410671353340149,
"logits/rejected": -0.43034911155700684,
"logps/chosen": -297.4024963378906,
"logps/rejected": -304.4224853515625,
"loss": 0.6832,
"rewards/accuracies": 0.36000001430511475,
"rewards/chosen": -0.05832824856042862,
"rewards/margins": 0.02584075927734375,
"rewards/rejected": -0.08419036865234375,
"step": 125
},
{
"epoch": 0.17939901330542682,
"grad_norm": 106.5,
"learning_rate": 2.968127490039841e-06,
"logits/chosen": -0.9279866814613342,
"logits/rejected": -0.6811022758483887,
"logps/chosen": -312.67498779296875,
"logps/rejected": -285.7799987792969,
"loss": 0.6709,
"rewards/accuracies": 0.49000000953674316,
"rewards/chosen": -0.07547790557146072,
"rewards/margins": 0.056133728474378586,
"rewards/rejected": -0.1316046118736267,
"step": 150
},
{
"epoch": 0.2092988488563313,
"grad_norm": 96.0,
"learning_rate": 3.466135458167331e-06,
"logits/chosen": -0.8703573346138,
"logits/rejected": -0.5601403713226318,
"logps/chosen": -323.947509765625,
"logps/rejected": -292.8074951171875,
"loss": 0.6696,
"rewards/accuracies": 0.4950000047683716,
"rewards/chosen": -0.11684814095497131,
"rewards/margins": 0.06319641321897507,
"rewards/rejected": -0.1800549328327179,
"step": 175
},
{
"epoch": 0.23919868440723577,
"grad_norm": 99.0,
"learning_rate": 3.9641434262948205e-06,
"logits/chosen": -0.9258654713630676,
"logits/rejected": -0.5686477422714233,
"logps/chosen": -328.7449951171875,
"logps/rejected": -316.5574951171875,
"loss": 0.6579,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1883123815059662,
"rewards/margins": 0.09867187589406967,
"rewards/rejected": -0.28693297505378723,
"step": 200
},
{
"epoch": 0.23919868440723577,
"eval_logits/chosen": -0.9216321706771851,
"eval_logits/rejected": -0.7277408838272095,
"eval_logps/chosen": -320.7849426269531,
"eval_logps/rejected": -293.8709716796875,
"eval_loss": 0.6465986371040344,
"eval_rewards/accuracies": 0.560387909412384,
"eval_rewards/chosen": -0.19119606912136078,
"eval_rewards/margins": 0.1261032223701477,
"eval_rewards/rejected": -0.31729716062545776,
"eval_runtime": 877.9315,
"eval_samples_per_second": 1.694,
"eval_steps_per_second": 0.212,
"step": 200
},
{
"epoch": 0.2690985199581402,
"grad_norm": 87.0,
"learning_rate": 4.462151394422311e-06,
"logits/chosen": -0.8007558584213257,
"logits/rejected": -0.505867600440979,
"logps/chosen": -320.7512512207031,
"logps/rejected": -311.8299865722656,
"loss": 0.6444,
"rewards/accuracies": 0.5649999976158142,
"rewards/chosen": -0.2540551722049713,
"rewards/margins": 0.14147095382213593,
"rewards/rejected": -0.3954962193965912,
"step": 225
},
{
"epoch": 0.2989983555090447,
"grad_norm": 96.5,
"learning_rate": 4.960159362549802e-06,
"logits/chosen": -0.9090196490287781,
"logits/rejected": -0.6456773281097412,
"logps/chosen": -323.7200012207031,
"logps/rejected": -295.2149963378906,
"loss": 0.6255,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.2805468738079071,
"rewards/margins": 0.19930054247379303,
"rewards/rejected": -0.47991272807121277,
"step": 250
},
{
"epoch": 0.32889819105994916,
"grad_norm": 91.0,
"learning_rate": 4.9490474080638015e-06,
"logits/chosen": -0.9534767270088196,
"logits/rejected": -0.6329247951507568,
"logps/chosen": -319.1549987792969,
"logps/rejected": -283.88751220703125,
"loss": 0.6192,
"rewards/accuracies": 0.5924999713897705,
"rewards/chosen": -0.29086607694625854,
"rewards/margins": 0.23339904844760895,
"rewards/rejected": -0.5240704417228699,
"step": 275
},
{
"epoch": 0.35879802661085364,
"grad_norm": 70.5,
"learning_rate": 4.8936641559592385e-06,
"logits/chosen": -0.9436456561088562,
"logits/rejected": -0.7789434790611267,
"logps/chosen": -349.5050048828125,
"logps/rejected": -310.48748779296875,
"loss": 0.627,
"rewards/accuracies": 0.6349999904632568,
"rewards/chosen": -0.30020782351493835,
"rewards/margins": 0.23243407905101776,
"rewards/rejected": -0.532727062702179,
"step": 300
},
{
"epoch": 0.3886978621617581,
"grad_norm": 101.0,
"learning_rate": 4.838280903854675e-06,
"logits/chosen": -0.9607565402984619,
"logits/rejected": -0.7166936993598938,
"logps/chosen": -317.0874938964844,
"logps/rejected": -289.0824890136719,
"loss": 0.5906,
"rewards/accuracies": 0.6524999737739563,
"rewards/chosen": -0.4176098704338074,
"rewards/margins": 0.3300067186355591,
"rewards/rejected": -0.7473974823951721,
"step": 325
},
{
"epoch": 0.4185976977126626,
"grad_norm": 94.0,
"learning_rate": 4.782897651750112e-06,
"logits/chosen": -0.9818115234375,
"logits/rejected": -0.6833120584487915,
"logps/chosen": -321.1875,
"logps/rejected": -316.58624267578125,
"loss": 0.577,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4978076219558716,
"rewards/margins": 0.39054566621780396,
"rewards/rejected": -0.8884375095367432,
"step": 350
},
{
"epoch": 0.44849753326356706,
"grad_norm": 83.5,
"learning_rate": 4.727514399645548e-06,
"logits/chosen": -1.0211011171340942,
"logits/rejected": -0.7218142747879028,
"logps/chosen": -307.9674987792969,
"logps/rejected": -288.7850036621094,
"loss": 0.5544,
"rewards/accuracies": 0.6974999904632568,
"rewards/chosen": -0.4097009301185608,
"rewards/margins": 0.4377111792564392,
"rewards/rejected": -0.8475390672683716,
"step": 375
},
{
"epoch": 0.47839736881447154,
"grad_norm": 77.5,
"learning_rate": 4.672131147540984e-06,
"logits/chosen": -0.9680676460266113,
"logits/rejected": -0.7582107782363892,
"logps/chosen": -337.9375,
"logps/rejected": -313.7749938964844,
"loss": 0.5977,
"rewards/accuracies": 0.6549999713897705,
"rewards/chosen": -0.5489477515220642,
"rewards/margins": 0.35999757051467896,
"rewards/rejected": -0.9089636206626892,
"step": 400
},
{
"epoch": 0.47839736881447154,
"eval_logits/chosen": -1.041106939315796,
"eval_logits/rejected": -0.8698605895042419,
"eval_logps/chosen": -323.7284851074219,
"eval_logps/rejected": -299.6156005859375,
"eval_loss": 0.5722406506538391,
"eval_rewards/accuracies": 0.6610022783279419,
"eval_rewards/chosen": -0.4932539761066437,
"eval_rewards/margins": 0.40423059463500977,
"eval_rewards/rejected": -0.8973480463027954,
"eval_runtime": 876.344,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 400
},
{
"epoch": 0.508297204365376,
"grad_norm": 84.0,
"learning_rate": 4.61674789543642e-06,
"logits/chosen": -1.2390661239624023,
"logits/rejected": -0.9836773872375488,
"logps/chosen": -328.1875,
"logps/rejected": -317.32501220703125,
"loss": 0.5527,
"rewards/accuracies": 0.6675000190734863,
"rewards/chosen": -0.6254773139953613,
"rewards/margins": 0.5287072658538818,
"rewards/rejected": -1.153835415840149,
"step": 425
},
{
"epoch": 0.5381970399162804,
"grad_norm": 93.0,
"learning_rate": 4.561364643331857e-06,
"logits/chosen": -1.0737494230270386,
"logits/rejected": -0.8683199882507324,
"logps/chosen": -316.00250244140625,
"logps/rejected": -295.9649963378906,
"loss": 0.5736,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": -0.539447009563446,
"rewards/margins": 0.46495360136032104,
"rewards/rejected": -1.0048657655715942,
"step": 450
},
{
"epoch": 0.5680968754671849,
"grad_norm": 73.5,
"learning_rate": 4.505981391227293e-06,
"logits/chosen": -1.052968144416809,
"logits/rejected": -0.7523078322410583,
"logps/chosen": -318.50250244140625,
"logps/rejected": -313.8175048828125,
"loss": 0.5422,
"rewards/accuracies": 0.7149999737739563,
"rewards/chosen": -0.5196704268455505,
"rewards/margins": 0.5570727586746216,
"rewards/rejected": -1.0764819383621216,
"step": 475
},
{
"epoch": 0.5979967110180894,
"grad_norm": 70.0,
"learning_rate": 4.4505981391227295e-06,
"logits/chosen": -1.1461485624313354,
"logits/rejected": -0.9354357719421387,
"logps/chosen": -324.4750061035156,
"logps/rejected": -294.0775146484375,
"loss": 0.5415,
"rewards/accuracies": 0.7074999809265137,
"rewards/chosen": -0.518980085849762,
"rewards/margins": 0.5734081864356995,
"rewards/rejected": -1.092441439628601,
"step": 500
},
{
"epoch": 0.6278965465689939,
"grad_norm": 84.0,
"learning_rate": 4.395214887018166e-06,
"logits/chosen": -1.091801404953003,
"logits/rejected": -0.8006445169448853,
"logps/chosen": -323.1724853515625,
"logps/rejected": -294.4674987792969,
"loss": 0.5646,
"rewards/accuracies": 0.6700000166893005,
"rewards/chosen": -0.672253429889679,
"rewards/margins": 0.5069983005523682,
"rewards/rejected": -1.1792798042297363,
"step": 525
},
{
"epoch": 0.6577963821198983,
"grad_norm": 95.0,
"learning_rate": 4.339831634913603e-06,
"logits/chosen": -1.220596194267273,
"logits/rejected": -0.9236291646957397,
"logps/chosen": -316.7950134277344,
"logps/rejected": -302.0824890136719,
"loss": 0.5178,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.7468109130859375,
"rewards/margins": 0.6105853319168091,
"rewards/rejected": -1.3566796779632568,
"step": 550
},
{
"epoch": 0.6876962176708028,
"grad_norm": 100.0,
"learning_rate": 4.284448382809039e-06,
"logits/chosen": -1.0421770811080933,
"logits/rejected": -0.7285050749778748,
"logps/chosen": -308.42498779296875,
"logps/rejected": -269.7037353515625,
"loss": 0.5448,
"rewards/accuracies": 0.6850000023841858,
"rewards/chosen": -0.7317401170730591,
"rewards/margins": 0.5794018507003784,
"rewards/rejected": -1.3115381002426147,
"step": 575
},
{
"epoch": 0.7175960532217073,
"grad_norm": 97.5,
"learning_rate": 4.229065130704476e-06,
"logits/chosen": -1.1298235654830933,
"logits/rejected": -0.7811802625656128,
"logps/chosen": -322.0574951171875,
"logps/rejected": -309.9750061035156,
"loss": 0.5292,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.590954601764679,
"rewards/margins": 0.6085253953933716,
"rewards/rejected": -1.1989331245422363,
"step": 600
},
{
"epoch": 0.7175960532217073,
"eval_logits/chosen": -1.078187346458435,
"eval_logits/rejected": -0.9206746220588684,
"eval_logps/chosen": -324.5967712402344,
"eval_logps/rejected": -301.7204284667969,
"eval_loss": 0.5492891669273376,
"eval_rewards/accuracies": 0.6757872104644775,
"eval_rewards/chosen": -0.5633505582809448,
"eval_rewards/margins": 0.5408346652984619,
"eval_rewards/rejected": -1.1038333177566528,
"eval_runtime": 876.4047,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 600
},
{
"epoch": 0.7474958887726117,
"grad_norm": 87.5,
"learning_rate": 4.173681878599912e-06,
"logits/chosen": -1.1809699535369873,
"logits/rejected": -0.8887664675712585,
"logps/chosen": -303.6575012207031,
"logps/rejected": -294.7774963378906,
"loss": 0.5261,
"rewards/accuracies": 0.7275000214576721,
"rewards/chosen": -0.5871319770812988,
"rewards/margins": 0.6293676495552063,
"rewards/rejected": -1.2162939310073853,
"step": 625
},
{
"epoch": 0.7773957243235162,
"grad_norm": 99.5,
"learning_rate": 4.118298626495348e-06,
"logits/chosen": -1.1009465456008911,
"logits/rejected": -0.9342904686927795,
"logps/chosen": -338.12750244140625,
"logps/rejected": -318.96624755859375,
"loss": 0.5603,
"rewards/accuracies": 0.6850000023841858,
"rewards/chosen": -0.714611828327179,
"rewards/margins": 0.6232568621635437,
"rewards/rejected": -1.3377538919448853,
"step": 650
},
{
"epoch": 0.8072955598744207,
"grad_norm": 72.5,
"learning_rate": 4.062915374390784e-06,
"logits/chosen": -1.2523653507232666,
"logits/rejected": -1.0046355724334717,
"logps/chosen": -310.9049987792969,
"logps/rejected": -297.67498779296875,
"loss": 0.5135,
"rewards/accuracies": 0.7099999785423279,
"rewards/chosen": -0.7437072992324829,
"rewards/margins": 0.6859521269798279,
"rewards/rejected": -1.4290771484375,
"step": 675
},
{
"epoch": 0.8371953954253252,
"grad_norm": 89.0,
"learning_rate": 4.007532122286221e-06,
"logits/chosen": -1.2401965856552124,
"logits/rejected": -0.8460285663604736,
"logps/chosen": -336.927490234375,
"logps/rejected": -318.7799987792969,
"loss": 0.5186,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7741259932518005,
"rewards/margins": 0.7083032131195068,
"rewards/rejected": -1.4823095798492432,
"step": 700
},
{
"epoch": 0.8670952309762296,
"grad_norm": 78.0,
"learning_rate": 3.9521488701816575e-06,
"logits/chosen": -1.1703033447265625,
"logits/rejected": -0.9548498392105103,
"logps/chosen": -287.87249755859375,
"logps/rejected": -300.864990234375,
"loss": 0.5476,
"rewards/accuracies": 0.6825000047683716,
"rewards/chosen": -0.8389843702316284,
"rewards/margins": 0.608197033405304,
"rewards/rejected": -1.447534203529358,
"step": 725
},
{
"epoch": 0.8969950665271341,
"grad_norm": 100.5,
"learning_rate": 3.896765618077094e-06,
"logits/chosen": -1.1477763652801514,
"logits/rejected": -0.9038227796554565,
"logps/chosen": -338.31500244140625,
"logps/rejected": -319.9649963378906,
"loss": 0.5148,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8131677508354187,
"rewards/margins": 0.7464379668235779,
"rewards/rejected": -1.559140682220459,
"step": 750
},
{
"epoch": 0.9268949020780386,
"grad_norm": 92.0,
"learning_rate": 3.84138236597253e-06,
"logits/chosen": -1.2342950105667114,
"logits/rejected": -0.946718156337738,
"logps/chosen": -331.1512451171875,
"logps/rejected": -304.0249938964844,
"loss": 0.528,
"rewards/accuracies": 0.7149999737739563,
"rewards/chosen": -0.9154602289199829,
"rewards/margins": 0.6957080364227295,
"rewards/rejected": -1.6108520030975342,
"step": 775
},
{
"epoch": 0.9567947376289431,
"grad_norm": 102.0,
"learning_rate": 3.7859991138679664e-06,
"logits/chosen": -1.0906939506530762,
"logits/rejected": -0.9649511575698853,
"logps/chosen": -338.5637512207031,
"logps/rejected": -338.4674987792969,
"loss": 0.5151,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": -0.859545886516571,
"rewards/margins": 0.7704944014549255,
"rewards/rejected": -1.630163550376892,
"step": 800
},
{
"epoch": 0.9567947376289431,
"eval_logits/chosen": -1.1360965967178345,
"eval_logits/rejected": -0.9822049736976624,
"eval_logps/chosen": -326.69891357421875,
"eval_logps/rejected": -305.0,
"eval_loss": 0.5390191674232483,
"eval_rewards/accuracies": 0.687980055809021,
"eval_rewards/chosen": -0.7810032367706299,
"eval_rewards/margins": 0.6442182064056396,
"eval_rewards/rejected": -1.4252588748931885,
"eval_runtime": 876.4063,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 800
},
{
"epoch": 0.9866945731798475,
"grad_norm": 84.5,
"learning_rate": 3.730615861763403e-06,
"logits/chosen": -1.2244549989700317,
"logits/rejected": NaN,
"logps/chosen": -334.5425109863281,
"logps/rejected": -339.23748779296875,
"loss": 0.5275,
"rewards/accuracies": 0.7149999737739563,
"rewards/chosen": -0.8379321098327637,
"rewards/margins": 0.715624988079071,
"rewards/rejected": -1.554010033607483,
"step": 825
},
{
"epoch": 1.0155479144864703,
"grad_norm": 57.25,
"learning_rate": 3.675232609658839e-06,
"logits/chosen": -1.2397924661636353,
"logits/rejected": -1.030158281326294,
"logps/chosen": -320.9093322753906,
"logps/rejected": -305.8393859863281,
"loss": 0.4669,
"rewards/accuracies": 0.7487046718597412,
"rewards/chosen": -0.7694060206413269,
"rewards/margins": 0.8478080630302429,
"rewards/rejected": -1.6172634363174438,
"step": 850
},
{
"epoch": 1.045447750037375,
"grad_norm": 67.5,
"learning_rate": 3.6198493575542758e-06,
"logits/chosen": -1.2220094203948975,
"logits/rejected": -0.9582018852233887,
"logps/chosen": -318.0262451171875,
"logps/rejected": -297.5799865722656,
"loss": 0.4691,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -0.7301892042160034,
"rewards/margins": 0.9199609160423279,
"rewards/rejected": -1.6502331495285034,
"step": 875
},
{
"epoch": 1.0753475855882793,
"grad_norm": 73.5,
"learning_rate": 3.564466105449712e-06,
"logits/chosen": -1.089396357536316,
"logits/rejected": -0.8958370685577393,
"logps/chosen": -317.61749267578125,
"logps/rejected": -295.4825134277344,
"loss": 0.4746,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -0.8305737376213074,
"rewards/margins": 0.8526538014411926,
"rewards/rejected": -1.6829102039337158,
"step": 900
},
{
"epoch": 1.1052474211391838,
"grad_norm": 64.5,
"learning_rate": 3.509082853345149e-06,
"logits/chosen": -1.1403405666351318,
"logits/rejected": -0.8662219047546387,
"logps/chosen": -322.0574951171875,
"logps/rejected": -323.2074890136719,
"loss": 0.4641,
"rewards/accuracies": 0.7649999856948853,
"rewards/chosen": -0.6764746308326721,
"rewards/margins": 0.8836804032325745,
"rewards/rejected": -1.5600537061691284,
"step": 925
},
{
"epoch": 1.1351472566900882,
"grad_norm": 66.0,
"learning_rate": 3.453699601240585e-06,
"logits/chosen": -1.2375200986862183,
"logits/rejected": -0.9549773931503296,
"logps/chosen": -321.0874938964844,
"logps/rejected": -306.6000061035156,
"loss": 0.4201,
"rewards/accuracies": 0.8224999904632568,
"rewards/chosen": -0.7068628072738647,
"rewards/margins": 1.0075805187225342,
"rewards/rejected": -1.7146776914596558,
"step": 950
},
{
"epoch": 1.1650470922409926,
"grad_norm": 64.0,
"learning_rate": 3.3983163491360217e-06,
"logits/chosen": -1.1668496131896973,
"logits/rejected": -0.8835460543632507,
"logps/chosen": -320.69000244140625,
"logps/rejected": -323.0425109863281,
"loss": 0.459,
"rewards/accuracies": 0.7825000286102295,
"rewards/chosen": -0.7173047065734863,
"rewards/margins": 0.9243432879447937,
"rewards/rejected": -1.6417040824890137,
"step": 975
},
{
"epoch": 1.1949469277918972,
"grad_norm": 62.75,
"learning_rate": 3.342933097031458e-06,
"logits/chosen": -1.2166632413864136,
"logits/rejected": -0.9624554514884949,
"logps/chosen": -301.0849914550781,
"logps/rejected": -304.3475036621094,
"loss": 0.4656,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.7919347882270813,
"rewards/margins": 0.9388867020606995,
"rewards/rejected": -1.73046875,
"step": 1000
},
{
"epoch": 1.1949469277918972,
"eval_logits/chosen": -1.160080075263977,
"eval_logits/rejected": -1.0079379081726074,
"eval_logps/chosen": -326.43280029296875,
"eval_logps/rejected": -305.1102294921875,
"eval_loss": 0.527574896812439,
"eval_rewards/accuracies": 0.6892281174659729,
"eval_rewards/chosen": -0.7565616369247437,
"eval_rewards/margins": 0.6851438879966736,
"eval_rewards/rejected": -1.4416320323944092,
"eval_runtime": 876.3772,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 1000
},
{
"epoch": 1.2248467633428017,
"grad_norm": 84.0,
"learning_rate": 3.2875498449268944e-06,
"logits/chosen": -1.1776912212371826,
"logits/rejected": -1.050445556640625,
"logps/chosen": -343.0050048828125,
"logps/rejected": -331.1875,
"loss": 0.4213,
"rewards/accuracies": 0.8050000071525574,
"rewards/chosen": -0.6588146686553955,
"rewards/margins": 1.0112402439117432,
"rewards/rejected": -1.670253872871399,
"step": 1025
},
{
"epoch": 1.254746598893706,
"grad_norm": 66.0,
"learning_rate": 3.2321665928223306e-06,
"logits/chosen": -1.2721245288848877,
"logits/rejected": -0.9186769127845764,
"logps/chosen": -316.4549865722656,
"logps/rejected": -315.2925109863281,
"loss": 0.4838,
"rewards/accuracies": 0.7825000286102295,
"rewards/chosen": -0.8342553973197937,
"rewards/margins": 0.83197021484375,
"rewards/rejected": -1.665708065032959,
"step": 1050
},
{
"epoch": 1.2846464344446105,
"grad_norm": 62.75,
"learning_rate": 3.176783340717767e-06,
"logits/chosen": -1.1176886558532715,
"logits/rejected": -0.9960334300994873,
"logps/chosen": -328.32501220703125,
"logps/rejected": -328.3450012207031,
"loss": 0.4538,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.7273278832435608,
"rewards/margins": 0.9573754668235779,
"rewards/rejected": -1.684999942779541,
"step": 1075
},
{
"epoch": 1.314546269995515,
"grad_norm": 84.5,
"learning_rate": 3.1214000886132033e-06,
"logits/chosen": -1.1655590534210205,
"logits/rejected": -0.8922329545021057,
"logps/chosen": -314.9700012207031,
"logps/rejected": -301.5050048828125,
"loss": 0.4483,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.6278771758079529,
"rewards/margins": 0.9427502155303955,
"rewards/rejected": -1.5707299709320068,
"step": 1100
},
{
"epoch": 1.3444461055464194,
"grad_norm": 69.5,
"learning_rate": 3.06601683650864e-06,
"logits/chosen": -1.2217812538146973,
"logits/rejected": -0.976731538772583,
"logps/chosen": -324.7850036621094,
"logps/rejected": -316.4599914550781,
"loss": 0.4368,
"rewards/accuracies": 0.8149999976158142,
"rewards/chosen": -0.7704944014549255,
"rewards/margins": 0.9598730206489563,
"rewards/rejected": -1.7300487756729126,
"step": 1125
},
{
"epoch": 1.374345941097324,
"grad_norm": 81.0,
"learning_rate": 3.010633584404076e-06,
"logits/chosen": -1.203802466392517,
"logits/rejected": -0.9061872959136963,
"logps/chosen": -330.4175109863281,
"logps/rejected": -312.9987487792969,
"loss": 0.4787,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7830480933189392,
"rewards/margins": 0.9129126071929932,
"rewards/rejected": -1.6956127882003784,
"step": 1150
},
{
"epoch": 1.4042457766482284,
"grad_norm": 118.0,
"learning_rate": 2.955250332299513e-06,
"logits/chosen": -1.1928298473358154,
"logits/rejected": -0.8999917507171631,
"logps/chosen": -320.2650146484375,
"logps/rejected": -301.5299987792969,
"loss": 0.4698,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -0.8731860518455505,
"rewards/margins": 0.9074377417564392,
"rewards/rejected": -1.7800854444503784,
"step": 1175
},
{
"epoch": 1.434145612199133,
"grad_norm": 65.0,
"learning_rate": 2.8998670801949493e-06,
"logits/chosen": -1.1984894275665283,
"logits/rejected": -0.9353277683258057,
"logps/chosen": -317.625,
"logps/rejected": -325.4075012207031,
"loss": 0.4502,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9375879168510437,
"rewards/margins": 0.9699438214302063,
"rewards/rejected": -1.9072656631469727,
"step": 1200
},
{
"epoch": 1.434145612199133,
"eval_logits/chosen": -1.156473159790039,
"eval_logits/rejected": -1.006028413772583,
"eval_logps/chosen": -327.82794189453125,
"eval_logps/rejected": -306.8521423339844,
"eval_loss": 0.5231196284294128,
"eval_rewards/accuracies": 0.6926843523979187,
"eval_rewards/chosen": -0.8996713161468506,
"eval_rewards/margins": 0.7130159735679626,
"eval_rewards/rejected": -1.6129347085952759,
"eval_runtime": 876.3506,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 1200
},
{
"epoch": 1.4640454477500373,
"grad_norm": 99.5,
"learning_rate": 2.844483828090386e-06,
"logits/chosen": -1.339633822441101,
"logits/rejected": -1.035129427909851,
"logps/chosen": -332.54998779296875,
"logps/rejected": -319.13751220703125,
"loss": 0.4421,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": -0.8549670577049255,
"rewards/margins": 1.0162646770477295,
"rewards/rejected": -1.8712304830551147,
"step": 1225
},
{
"epoch": 1.493945283300942,
"grad_norm": 83.5,
"learning_rate": 2.789100575985822e-06,
"logits/chosen": -1.1476205587387085,
"logits/rejected": -0.9250108599662781,
"logps/chosen": -322.0050048828125,
"logps/rejected": -309.3500061035156,
"loss": 0.4555,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -0.8130224347114563,
"rewards/margins": 0.9434008598327637,
"rewards/rejected": -1.7563867568969727,
"step": 1250
},
{
"epoch": 1.5238451188518463,
"grad_norm": 63.75,
"learning_rate": 2.7337173238812586e-06,
"logits/chosen": -1.2015457153320312,
"logits/rejected": -0.8530246019363403,
"logps/chosen": -309.01251220703125,
"logps/rejected": -297.7825012207031,
"loss": 0.4501,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.836810290813446,
"rewards/margins": 0.9292749166488647,
"rewards/rejected": -1.7654907703399658,
"step": 1275
},
{
"epoch": 1.5537449544027506,
"grad_norm": 67.0,
"learning_rate": 2.6783340717766948e-06,
"logits/chosen": -1.2457306385040283,
"logits/rejected": -1.0591107606887817,
"logps/chosen": -337.9775085449219,
"logps/rejected": -308.5375061035156,
"loss": 0.4248,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.7735278606414795,
"rewards/margins": 1.035646915435791,
"rewards/rejected": -1.8087304830551147,
"step": 1300
},
{
"epoch": 1.5836447899536552,
"grad_norm": 51.0,
"learning_rate": 2.6229508196721314e-06,
"logits/chosen": -1.216982126235962,
"logits/rejected": -0.8925817608833313,
"logps/chosen": -333.2349853515625,
"logps/rejected": -316.62249755859375,
"loss": 0.4568,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.8274877667427063,
"rewards/margins": 0.9530566334724426,
"rewards/rejected": -1.7805664539337158,
"step": 1325
},
{
"epoch": 1.6135446255045598,
"grad_norm": 82.0,
"learning_rate": 2.5675675675675675e-06,
"logits/chosen": -1.3132140636444092,
"logits/rejected": -1.004296898841858,
"logps/chosen": -342.4949951171875,
"logps/rejected": -317.69500732421875,
"loss": 0.429,
"rewards/accuracies": 0.8050000071525574,
"rewards/chosen": -0.9008423089981079,
"rewards/margins": 1.0281542539596558,
"rewards/rejected": -1.9285448789596558,
"step": 1350
},
{
"epoch": 1.6434444610554642,
"grad_norm": 116.5,
"learning_rate": 2.5121843154630045e-06,
"logits/chosen": -1.1408294439315796,
"logits/rejected": -0.9321377277374268,
"logps/chosen": -335.291259765625,
"logps/rejected": -321.29376220703125,
"loss": 0.453,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8236993551254272,
"rewards/margins": 0.9510498046875,
"rewards/rejected": -1.77447509765625,
"step": 1375
},
{
"epoch": 1.6733442966063685,
"grad_norm": 91.0,
"learning_rate": 2.4568010633584403e-06,
"logits/chosen": -1.1858936548233032,
"logits/rejected": -0.9579010009765625,
"logps/chosen": -320.9949951171875,
"logps/rejected": -296.3374938964844,
"loss": 0.4699,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -0.8678625226020813,
"rewards/margins": 0.9215136766433716,
"rewards/rejected": -1.7896509170532227,
"step": 1400
},
{
"epoch": 1.6733442966063685,
"eval_logits/chosen": -1.1674253940582275,
"eval_logits/rejected": -1.0171688795089722,
"eval_logps/chosen": -327.3978576660156,
"eval_logps/rejected": -306.6209716796875,
"eval_loss": 0.5191056728363037,
"eval_rewards/accuracies": 0.6933563947677612,
"eval_rewards/chosen": -0.8476693630218506,
"eval_rewards/margins": 0.7431673407554626,
"eval_rewards/rejected": -1.5906811952590942,
"eval_runtime": 876.3262,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 1400
},
{
"epoch": 1.703244132157273,
"grad_norm": 82.0,
"learning_rate": 2.401417811253877e-06,
"logits/chosen": -1.1833282709121704,
"logits/rejected": -0.9263910055160522,
"logps/chosen": -324.5150146484375,
"logps/rejected": -316.1650085449219,
"loss": 0.451,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": -0.8199084401130676,
"rewards/margins": 0.9980810284614563,
"rewards/rejected": -1.8175097703933716,
"step": 1425
},
{
"epoch": 1.7331439677081777,
"grad_norm": 99.0,
"learning_rate": 2.3460345591493135e-06,
"logits/chosen": -1.1936352252960205,
"logits/rejected": -1.0041576623916626,
"logps/chosen": -350.885009765625,
"logps/rejected": -327.0450134277344,
"loss": 0.4702,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9122155904769897,
"rewards/margins": 0.9335852265357971,
"rewards/rejected": -1.8462109565734863,
"step": 1450
},
{
"epoch": 1.763043803259082,
"grad_norm": 59.5,
"learning_rate": 2.2906513070447496e-06,
"logits/chosen": -1.3379946947097778,
"logits/rejected": -1.0853075981140137,
"logps/chosen": -299.1099853515625,
"logps/rejected": -299.9725036621094,
"loss": 0.4607,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.905989408493042,
"rewards/margins": 1.0363476276397705,
"rewards/rejected": -1.942041039466858,
"step": 1475
},
{
"epoch": 1.7929436388099864,
"grad_norm": 102.0,
"learning_rate": 2.235268054940186e-06,
"logits/chosen": -1.1545830965042114,
"logits/rejected": -0.8675525188446045,
"logps/chosen": -321.79998779296875,
"logps/rejected": -300.4262390136719,
"loss": 0.4854,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -0.8690832257270813,
"rewards/margins": 0.9056127667427063,
"rewards/rejected": -1.7749096155166626,
"step": 1500
},
{
"epoch": 1.822843474360891,
"grad_norm": 60.0,
"learning_rate": 2.179884802835623e-06,
"logits/chosen": -1.2606717348098755,
"logits/rejected": -1.0567920207977295,
"logps/chosen": -328.82501220703125,
"logps/rejected": -304.1050109863281,
"loss": 0.4552,
"rewards/accuracies": 0.7850000262260437,
"rewards/chosen": -0.743670642375946,
"rewards/margins": 1.0134960412979126,
"rewards/rejected": -1.7573193311691284,
"step": 1525
},
{
"epoch": 1.8527433099117956,
"grad_norm": 59.5,
"learning_rate": 2.124501550731059e-06,
"logits/chosen": -1.2121707201004028,
"logits/rejected": -1.002629041671753,
"logps/chosen": -323.5950012207031,
"logps/rejected": -317.5299987792969,
"loss": 0.4645,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9758337140083313,
"rewards/margins": 0.9835278391838074,
"rewards/rejected": -1.959287166595459,
"step": 1550
},
{
"epoch": 1.8826431454627,
"grad_norm": 71.0,
"learning_rate": 2.0691182986264955e-06,
"logits/chosen": -1.296298861503601,
"logits/rejected": NaN,
"logps/chosen": -325.7699890136719,
"logps/rejected": -299.322509765625,
"loss": 0.4515,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": -0.8331592082977295,
"rewards/margins": 0.9821679592132568,
"rewards/rejected": -1.8158252239227295,
"step": 1575
},
{
"epoch": 1.9125429810136043,
"grad_norm": 70.0,
"learning_rate": 2.0137350465219317e-06,
"logits/chosen": -1.2260925769805908,
"logits/rejected": -0.9426334500312805,
"logps/chosen": -330.06500244140625,
"logps/rejected": -309.68499755859375,
"loss": 0.4436,
"rewards/accuracies": 0.7649999856948853,
"rewards/chosen": -0.830242931842804,
"rewards/margins": 0.9743407964706421,
"rewards/rejected": -1.804931640625,
"step": 1600
},
{
"epoch": 1.9125429810136043,
"eval_logits/chosen": -1.1829742193222046,
"eval_logits/rejected": -1.033914566040039,
"eval_logps/chosen": -327.43011474609375,
"eval_logps/rejected": -306.69085693359375,
"eval_loss": 0.5206477046012878,
"eval_rewards/accuracies": 0.6974846720695496,
"eval_rewards/chosen": -0.8544062376022339,
"eval_rewards/margins": 0.7440763115882874,
"eval_rewards/rejected": -1.598265290260315,
"eval_runtime": 876.3416,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 1600
},
{
"epoch": 1.942442816564509,
"grad_norm": 73.5,
"learning_rate": 1.9583517944173683e-06,
"logits/chosen": -1.246303677558899,
"logits/rejected": -0.9357275366783142,
"logps/chosen": -332.3599853515625,
"logps/rejected": -309.1700134277344,
"loss": 0.4702,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8381909132003784,
"rewards/margins": 0.9997217059135437,
"rewards/rejected": -1.837497591972351,
"step": 1625
},
{
"epoch": 1.9723426521154135,
"grad_norm": 68.5,
"learning_rate": 1.9029685423128047e-06,
"logits/chosen": -1.2618129253387451,
"logits/rejected": -1.0779250860214233,
"logps/chosen": -339.9324951171875,
"logps/rejected": -318.04998779296875,
"loss": 0.4583,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.8390514850616455,
"rewards/margins": 1.0396826267242432,
"rewards/rejected": -1.878564476966858,
"step": 1650
},
{
"epoch": 2.0011959934220362,
"grad_norm": 97.0,
"learning_rate": 1.847585290208241e-06,
"logits/chosen": -1.2342288494110107,
"logits/rejected": -0.9683116674423218,
"logps/chosen": -332.2409362792969,
"logps/rejected": -321.0531005859375,
"loss": 0.424,
"rewards/accuracies": 0.7642487287521362,
"rewards/chosen": -0.7630558013916016,
"rewards/margins": 1.0779491662979126,
"rewards/rejected": -1.8409063816070557,
"step": 1675
},
{
"epoch": 2.0310958289729406,
"grad_norm": 76.0,
"learning_rate": 1.7922020381036776e-06,
"logits/chosen": -1.318371295928955,
"logits/rejected": -1.0083489418029785,
"logps/chosen": -327.114990234375,
"logps/rejected": -336.697509765625,
"loss": 0.3965,
"rewards/accuracies": 0.8475000262260437,
"rewards/chosen": -0.7496582269668579,
"rewards/margins": 1.0661474466323853,
"rewards/rejected": -1.8159960508346558,
"step": 1700
},
{
"epoch": 2.060995664523845,
"grad_norm": 102.5,
"learning_rate": 1.736818785999114e-06,
"logits/chosen": -1.2396435737609863,
"logits/rejected": -0.9828730225563049,
"logps/chosen": -332.7074890136719,
"logps/rejected": -333.37249755859375,
"loss": 0.4101,
"rewards/accuracies": 0.8149999976158142,
"rewards/chosen": -0.7449682354927063,
"rewards/margins": 1.1290674209594727,
"rewards/rejected": -1.8738598823547363,
"step": 1725
},
{
"epoch": 2.09089550007475,
"grad_norm": 62.25,
"learning_rate": 1.6814355338945504e-06,
"logits/chosen": -1.2273823022842407,
"logits/rejected": -0.88829505443573,
"logps/chosen": -322.93499755859375,
"logps/rejected": -300.385009765625,
"loss": 0.4221,
"rewards/accuracies": 0.8050000071525574,
"rewards/chosen": -0.903369128704071,
"rewards/margins": 1.0416357517242432,
"rewards/rejected": -1.9447948932647705,
"step": 1750
},
{
"epoch": 2.120795335625654,
"grad_norm": 86.5,
"learning_rate": 1.6260522817899868e-06,
"logits/chosen": -1.2524548768997192,
"logits/rejected": -1.0671484470367432,
"logps/chosen": -333.92999267578125,
"logps/rejected": -318.6400146484375,
"loss": 0.4119,
"rewards/accuracies": 0.8149999976158142,
"rewards/chosen": -0.7944982647895813,
"rewards/margins": 1.1625818014144897,
"rewards/rejected": -1.9566112756729126,
"step": 1775
},
{
"epoch": 2.1506951711765585,
"grad_norm": 90.0,
"learning_rate": 1.5706690296854231e-06,
"logits/chosen": -1.2237915992736816,
"logits/rejected": -0.956585705280304,
"logps/chosen": -320.30999755859375,
"logps/rejected": -302.2674865722656,
"loss": 0.4528,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9091894626617432,
"rewards/margins": 1.0250316858291626,
"rewards/rejected": -1.9344677925109863,
"step": 1800
},
{
"epoch": 2.1506951711765585,
"eval_logits/chosen": -1.191327452659607,
"eval_logits/rejected": -1.0433924198150635,
"eval_logps/chosen": -327.741943359375,
"eval_logps/rejected": -307.1559143066406,
"eval_loss": 0.5188325047492981,
"eval_rewards/accuracies": 0.6941244602203369,
"eval_rewards/chosen": -0.8884723782539368,
"eval_rewards/margins": 0.7567348480224609,
"eval_rewards/rejected": -1.6454237699508667,
"eval_runtime": 876.3236,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 1800
},
{
"epoch": 2.180595006727463,
"grad_norm": 74.5,
"learning_rate": 1.5152857775808597e-06,
"logits/chosen": -1.2849377393722534,
"logits/rejected": -0.9589782953262329,
"logps/chosen": -321.9987487792969,
"logps/rejected": -307.2149963378906,
"loss": 0.4031,
"rewards/accuracies": 0.8349999785423279,
"rewards/chosen": -0.7700170874595642,
"rewards/margins": 1.1218103170394897,
"rewards/rejected": -1.8917040824890137,
"step": 1825
},
{
"epoch": 2.2104948422783677,
"grad_norm": 73.5,
"learning_rate": 1.459902525476296e-06,
"logits/chosen": -1.136842131614685,
"logits/rejected": -0.9383144974708557,
"logps/chosen": -319.8525085449219,
"logps/rejected": -333.6600036621094,
"loss": 0.424,
"rewards/accuracies": 0.8075000047683716,
"rewards/chosen": -0.8708154559135437,
"rewards/margins": 1.0324267148971558,
"rewards/rejected": -1.903378963470459,
"step": 1850
},
{
"epoch": 2.240394677829272,
"grad_norm": 72.5,
"learning_rate": 1.4045192733717325e-06,
"logits/chosen": -1.1802786588668823,
"logits/rejected": -0.9680548310279846,
"logps/chosen": -317.48748779296875,
"logps/rejected": -299.19000244140625,
"loss": 0.4262,
"rewards/accuracies": 0.8274999856948853,
"rewards/chosen": -0.8513085842132568,
"rewards/margins": 1.0704809427261353,
"rewards/rejected": -1.9216357469558716,
"step": 1875
},
{
"epoch": 2.2702945133801764,
"grad_norm": 84.0,
"learning_rate": 1.3491360212671688e-06,
"logits/chosen": -1.2559946775436401,
"logits/rejected": -0.9639026522636414,
"logps/chosen": -336.9750061035156,
"logps/rejected": -323.49249267578125,
"loss": 0.4294,
"rewards/accuracies": 0.8025000095367432,
"rewards/chosen": -0.8724609613418579,
"rewards/margins": 1.0881787538528442,
"rewards/rejected": -1.960756778717041,
"step": 1900
},
{
"epoch": 2.3001943489310808,
"grad_norm": 71.0,
"learning_rate": 1.2937527691626054e-06,
"logits/chosen": -1.3266677856445312,
"logits/rejected": -1.0626074075698853,
"logps/chosen": -305.86749267578125,
"logps/rejected": -291.93499755859375,
"loss": 0.4471,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9192346334457397,
"rewards/margins": 1.0141992568969727,
"rewards/rejected": -1.9337549209594727,
"step": 1925
},
{
"epoch": 2.330094184481985,
"grad_norm": 109.5,
"learning_rate": 1.2383695170580418e-06,
"logits/chosen": -1.1726070642471313,
"logits/rejected": -1.0060466527938843,
"logps/chosen": -309.7799987792969,
"logps/rejected": -311.13751220703125,
"loss": 0.4333,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -0.8455395698547363,
"rewards/margins": 1.0642285346984863,
"rewards/rejected": -1.9100537300109863,
"step": 1950
},
{
"epoch": 2.35999402003289,
"grad_norm": 43.0,
"learning_rate": 1.1829862649534782e-06,
"logits/chosen": -1.189868450164795,
"logits/rejected": -1.0110809803009033,
"logps/chosen": -343.5849914550781,
"logps/rejected": -329.1675109863281,
"loss": 0.4071,
"rewards/accuracies": 0.8224999904632568,
"rewards/chosen": -0.8902783393859863,
"rewards/margins": 1.0464379787445068,
"rewards/rejected": -1.9371508359909058,
"step": 1975
},
{
"epoch": 2.3898938555837943,
"grad_norm": 86.5,
"learning_rate": 1.1276030128489146e-06,
"logits/chosen": -1.3213348388671875,
"logits/rejected": -1.0948954820632935,
"logps/chosen": -331.0174865722656,
"logps/rejected": -307.2900085449219,
"loss": 0.4075,
"rewards/accuracies": 0.8349999785423279,
"rewards/chosen": -0.8052575588226318,
"rewards/margins": 1.1002050638198853,
"rewards/rejected": -1.9058740139007568,
"step": 2000
},
{
"epoch": 2.3898938555837943,
"eval_logits/chosen": -1.1904795169830322,
"eval_logits/rejected": -1.042686104774475,
"eval_logps/chosen": -327.67205810546875,
"eval_logps/rejected": -307.0806579589844,
"eval_loss": 0.5186262726783752,
"eval_rewards/accuracies": 0.6967166662216187,
"eval_rewards/chosen": -0.8813358545303345,
"eval_rewards/margins": 0.7553303837776184,
"eval_rewards/rejected": -1.6366767883300781,
"eval_runtime": 876.3711,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 2000
},
{
"epoch": 2.4197936911346987,
"grad_norm": 67.0,
"learning_rate": 1.072219760744351e-06,
"logits/chosen": -1.2627320289611816,
"logits/rejected": -1.0026310682296753,
"logps/chosen": -335.5675048828125,
"logps/rejected": -301.01251220703125,
"loss": 0.4202,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -0.8969201445579529,
"rewards/margins": 1.085205078125,
"rewards/rejected": -1.9821679592132568,
"step": 2025
},
{
"epoch": 2.4496935266856035,
"grad_norm": 86.0,
"learning_rate": 1.0168365086397875e-06,
"logits/chosen": -1.2463324069976807,
"logits/rejected": -0.9855798482894897,
"logps/chosen": -332.5849914550781,
"logps/rejected": -324.9624938964844,
"loss": 0.4193,
"rewards/accuracies": 0.7925000190734863,
"rewards/chosen": -0.8326050043106079,
"rewards/margins": 1.0910131931304932,
"rewards/rejected": -1.9229882955551147,
"step": 2050
},
{
"epoch": 2.479593362236508,
"grad_norm": 53.75,
"learning_rate": 9.61453256535224e-07,
"logits/chosen": -1.2372454404830933,
"logits/rejected": -0.9461462497711182,
"logps/chosen": -328.4750061035156,
"logps/rejected": -300.5224914550781,
"loss": 0.4611,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -0.8591150045394897,
"rewards/margins": 0.9913061261177063,
"rewards/rejected": -1.8506054878234863,
"step": 2075
},
{
"epoch": 2.509493197787412,
"grad_norm": 68.0,
"learning_rate": 9.060700044306603e-07,
"logits/chosen": -1.2847473621368408,
"logits/rejected": -1.0720292329788208,
"logps/chosen": -337.26251220703125,
"logps/rejected": -307.17498779296875,
"loss": 0.4101,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": -0.8909338116645813,
"rewards/margins": 1.1306884288787842,
"rewards/rejected": -2.021728515625,
"step": 2100
},
{
"epoch": 2.5393930333383166,
"grad_norm": 101.0,
"learning_rate": 8.506867523260968e-07,
"logits/chosen": -1.1994116306304932,
"logits/rejected": -0.9730746746063232,
"logps/chosen": -338.3999938964844,
"logps/rejected": -304.99749755859375,
"loss": 0.4387,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": -0.7841222882270813,
"rewards/margins": 1.0449267625808716,
"rewards/rejected": -1.829746127128601,
"step": 2125
},
{
"epoch": 2.569292868889221,
"grad_norm": 68.5,
"learning_rate": 7.953035002215331e-07,
"logits/chosen": -1.3298254013061523,
"logits/rejected": -1.118627667427063,
"logps/chosen": -309.739990234375,
"logps/rejected": -308.24749755859375,
"loss": 0.4449,
"rewards/accuracies": 0.7774999737739563,
"rewards/chosen": -0.8520336747169495,
"rewards/margins": 0.9700658917427063,
"rewards/rejected": -1.8218945264816284,
"step": 2150
},
{
"epoch": 2.5991927044401257,
"grad_norm": 70.5,
"learning_rate": 7.399202481169695e-07,
"logits/chosen": -1.1831958293914795,
"logits/rejected": NaN,
"logps/chosen": -327.49249267578125,
"logps/rejected": -289.5924987792969,
"loss": 0.4473,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8408032059669495,
"rewards/margins": 0.9420214891433716,
"rewards/rejected": -1.7829101085662842,
"step": 2175
},
{
"epoch": 2.62909253999103,
"grad_norm": 54.0,
"learning_rate": 6.845369960124059e-07,
"logits/chosen": -1.2656641006469727,
"logits/rejected": -0.9782373309135437,
"logps/chosen": -324.4200134277344,
"logps/rejected": -290.0675048828125,
"loss": 0.4419,
"rewards/accuracies": 0.7825000286102295,
"rewards/chosen": -0.9666149616241455,
"rewards/margins": 1.0030114650726318,
"rewards/rejected": -1.9694628715515137,
"step": 2200
},
{
"epoch": 2.62909253999103,
"eval_logits/chosen": -1.1868830919265747,
"eval_logits/rejected": -1.0399714708328247,
"eval_logps/chosen": -327.6585998535156,
"eval_logps/rejected": -306.9704284667969,
"eval_loss": 0.5178263783454895,
"eval_rewards/accuracies": 0.6993087530136108,
"eval_rewards/chosen": -0.8778404593467712,
"eval_rewards/margins": 0.7548588514328003,
"eval_rewards/rejected": -1.6324502229690552,
"eval_runtime": 876.3727,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 2200
},
{
"epoch": 2.6589923755419345,
"grad_norm": 67.5,
"learning_rate": 6.291537439078423e-07,
"logits/chosen": -1.2253618240356445,
"logits/rejected": -1.0349105596542358,
"logps/chosen": -336.12249755859375,
"logps/rejected": -311.8275146484375,
"loss": 0.4574,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -0.8752642869949341,
"rewards/margins": 0.9961340427398682,
"rewards/rejected": -1.8713818788528442,
"step": 2225
},
{
"epoch": 2.688892211092839,
"grad_norm": 100.0,
"learning_rate": 5.737704918032787e-07,
"logits/chosen": -1.2597771883010864,
"logits/rejected": -0.9909564256668091,
"logps/chosen": -326.6600036621094,
"logps/rejected": -316.19000244140625,
"loss": 0.4751,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9248193502426147,
"rewards/margins": 0.9592040777206421,
"rewards/rejected": -1.8837096691131592,
"step": 2250
},
{
"epoch": 2.7187920466437436,
"grad_norm": 76.0,
"learning_rate": 5.183872396987152e-07,
"logits/chosen": -1.2072705030441284,
"logits/rejected": -0.9592925906181335,
"logps/chosen": -322.36248779296875,
"logps/rejected": -315.8374938964844,
"loss": 0.391,
"rewards/accuracies": 0.8274999856948853,
"rewards/chosen": -0.7576141357421875,
"rewards/margins": 1.160730004310608,
"rewards/rejected": -1.9182031154632568,
"step": 2275
},
{
"epoch": 2.748691882194648,
"grad_norm": 53.0,
"learning_rate": 4.630039875941516e-07,
"logits/chosen": -1.287199854850769,
"logits/rejected": -0.9606054425239563,
"logps/chosen": -344.7650146484375,
"logps/rejected": -331.24749755859375,
"loss": 0.4177,
"rewards/accuracies": 0.8149999976158142,
"rewards/chosen": -0.7748047113418579,
"rewards/margins": 1.1645703315734863,
"rewards/rejected": -1.9394140243530273,
"step": 2300
},
{
"epoch": 2.7785917177455524,
"grad_norm": 87.0,
"learning_rate": 4.07620735489588e-07,
"logits/chosen": -1.2260528802871704,
"logits/rejected": -1.0005972385406494,
"logps/chosen": -312.9624938964844,
"logps/rejected": -323.0400085449219,
"loss": 0.3917,
"rewards/accuracies": 0.8349999785423279,
"rewards/chosen": -0.7925238013267517,
"rewards/margins": 1.185449242591858,
"rewards/rejected": -1.9780443906784058,
"step": 2325
},
{
"epoch": 2.8084915532964567,
"grad_norm": 56.5,
"learning_rate": 3.5223748338502434e-07,
"logits/chosen": -1.2027392387390137,
"logits/rejected": -0.989107608795166,
"logps/chosen": -321.3762512207031,
"logps/rejected": -318.11749267578125,
"loss": 0.4052,
"rewards/accuracies": 0.8174999952316284,
"rewards/chosen": -0.8751891851425171,
"rewards/margins": 1.1021533012390137,
"rewards/rejected": -1.976718783378601,
"step": 2350
},
{
"epoch": 2.838391388847361,
"grad_norm": 54.5,
"learning_rate": 2.968542312804608e-07,
"logits/chosen": -1.2425882816314697,
"logits/rejected": -0.9340093731880188,
"logps/chosen": -335.12249755859375,
"logps/rejected": -320.2049865722656,
"loss": 0.4115,
"rewards/accuracies": 0.8224999904632568,
"rewards/chosen": -0.8292675614356995,
"rewards/margins": 1.1182934045791626,
"rewards/rejected": -1.9483104944229126,
"step": 2375
},
{
"epoch": 2.868291224398266,
"grad_norm": 87.0,
"learning_rate": 2.4147097917589725e-07,
"logits/chosen": -1.3012477159500122,
"logits/rejected": -1.0664279460906982,
"logps/chosen": -293.489990234375,
"logps/rejected": -285.197509765625,
"loss": 0.4277,
"rewards/accuracies": 0.8025000095367432,
"rewards/chosen": -0.8684576153755188,
"rewards/margins": 1.069272518157959,
"rewards/rejected": -1.9371191263198853,
"step": 2400
},
{
"epoch": 2.868291224398266,
"eval_logits/chosen": -1.1853525638580322,
"eval_logits/rejected": -1.0373817682266235,
"eval_logps/chosen": -327.3817138671875,
"eval_logps/rejected": -306.81451416015625,
"eval_loss": 0.5165102481842041,
"eval_rewards/accuracies": 0.7006528377532959,
"eval_rewards/chosen": -0.8549529314041138,
"eval_rewards/margins": 0.7583125829696655,
"eval_rewards/rejected": -1.6133127212524414,
"eval_runtime": 876.3322,
"eval_samples_per_second": 1.697,
"eval_steps_per_second": 0.212,
"step": 2400
},
{
"epoch": 2.8981910599491703,
"grad_norm": 46.0,
"learning_rate": 1.8608772707133363e-07,
"logits/chosen": -1.356745958328247,
"logits/rejected": -1.0496530532836914,
"logps/chosen": -319.9649963378906,
"logps/rejected": -309.7025146484375,
"loss": 0.4037,
"rewards/accuracies": 0.8025000095367432,
"rewards/chosen": -0.8254479765892029,
"rewards/margins": 1.1192578077316284,
"rewards/rejected": -1.9445117712020874,
"step": 2425
},
{
"epoch": 2.9280908955000746,
"grad_norm": 70.5,
"learning_rate": 1.3070447496677006e-07,
"logits/chosen": -1.2751880884170532,
"logits/rejected": -1.0796799659729004,
"logps/chosen": -316.9425048828125,
"logps/rejected": -325.7550048828125,
"loss": 0.4306,
"rewards/accuracies": 0.7724999785423279,
"rewards/chosen": -0.8079773187637329,
"rewards/margins": 1.000207543373108,
"rewards/rejected": -1.8083984851837158,
"step": 2450
},
{
"epoch": 2.9579907310509794,
"grad_norm": 74.0,
"learning_rate": 7.532122286220647e-08,
"logits/chosen": -1.2595221996307373,
"logits/rejected": -1.0140166282653809,
"logps/chosen": -320.6000061035156,
"logps/rejected": -318.6600036621094,
"loss": 0.4808,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.000390648841858,
"rewards/margins": 0.931530773639679,
"rewards/rejected": -1.9319677352905273,
"step": 2475
},
{
"epoch": 2.987890566601884,
"grad_norm": 78.5,
"learning_rate": 1.993797075764289e-08,
"logits/chosen": -1.2403491735458374,
"logits/rejected": -0.9544309973716736,
"logps/chosen": -343.76251220703125,
"logps/rejected": -336.38250732421875,
"loss": 0.4225,
"rewards/accuracies": 0.8149999976158142,
"rewards/chosen": -0.7856341600418091,
"rewards/margins": 1.0573632717132568,
"rewards/rejected": -1.8428466320037842,
"step": 2500
}
],
"logging_steps": 25,
"max_steps": 2508,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}