Gemma-2B-It-ORPO / trainer_state.json
chchen's picture
End of training
5fd6e30 verified
{
"best_metric": 1.3996269702911377,
"best_model_checkpoint": "saves/Gemma-2B-It/lora/orpo/checkpoint-1500",
"epoch": 2.997999555456768,
"eval_steps": 500,
"global_step": 1686,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017781729273171815,
"grad_norm": 1.9148550033569336,
"learning_rate": 4.9995745934141085e-06,
"logits/chosen": -22.071788787841797,
"logits/rejected": -21.994897842407227,
"logps/chosen": -1.9321304559707642,
"logps/rejected": -2.141274929046631,
"loss": 2.0148,
"odds_ratio_loss": 0.8263328671455383,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.19321303069591522,
"rewards/margins": 0.020914455875754356,
"rewards/rejected": -0.21412746608257294,
"sft_loss": 1.9321304559707642,
"step": 10
},
{
"epoch": 0.03556345854634363,
"grad_norm": 4.772641181945801,
"learning_rate": 4.9982812903243405e-06,
"logits/chosen": -21.850475311279297,
"logits/rejected": -22.140661239624023,
"logps/chosen": -2.000199556350708,
"logps/rejected": -1.9620949029922485,
"loss": 2.0912,
"odds_ratio_loss": 0.9096724390983582,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.2000199258327484,
"rewards/margins": -0.003810454858466983,
"rewards/rejected": -0.19620949029922485,
"sft_loss": 2.000199556350708,
"step": 20
},
{
"epoch": 0.05334518781951545,
"grad_norm": 1.9645005464553833,
"learning_rate": 4.996120496405222e-06,
"logits/chosen": -22.181926727294922,
"logits/rejected": -22.236988067626953,
"logps/chosen": -1.9057893753051758,
"logps/rejected": -2.2623982429504395,
"loss": 1.9768,
"odds_ratio_loss": 0.7102858424186707,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.19057895243167877,
"rewards/margins": 0.03566090017557144,
"rewards/rejected": -0.22623984515666962,
"sft_loss": 1.9057893753051758,
"step": 30
},
{
"epoch": 0.07112691709268726,
"grad_norm": 1.9976199865341187,
"learning_rate": 4.99309296196014e-06,
"logits/chosen": -22.178194046020508,
"logits/rejected": -22.227825164794922,
"logps/chosen": -1.8588358163833618,
"logps/rejected": -2.0477230548858643,
"loss": 1.94,
"odds_ratio_loss": 0.8119063377380371,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.18588361144065857,
"rewards/margins": 0.01888870820403099,
"rewards/rejected": -0.20477227866649628,
"sft_loss": 1.8588358163833618,
"step": 40
},
{
"epoch": 0.08890864636585907,
"grad_norm": 1.6904418468475342,
"learning_rate": 4.989199738255166e-06,
"logits/chosen": -22.063446044921875,
"logits/rejected": -22.088878631591797,
"logps/chosen": -1.8785845041275024,
"logps/rejected": -2.0510427951812744,
"loss": 1.9601,
"odds_ratio_loss": 0.81475830078125,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.187858447432518,
"rewards/margins": 0.0172458253800869,
"rewards/rejected": -0.2051042765378952,
"sft_loss": 1.8785845041275024,
"step": 50
},
{
"epoch": 0.1066903756390309,
"grad_norm": 1.597947359085083,
"learning_rate": 4.984442177154031e-06,
"logits/chosen": -22.34821319580078,
"logits/rejected": -22.315746307373047,
"logps/chosen": -1.9788051843643188,
"logps/rejected": -2.084188461303711,
"loss": 2.0713,
"odds_ratio_loss": 0.924887478351593,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.19788053631782532,
"rewards/margins": 0.01053832471370697,
"rewards/rejected": -0.20841887593269348,
"sft_loss": 1.9788051843643188,
"step": 60
},
{
"epoch": 0.12447210491220272,
"grad_norm": 2.274142026901245,
"learning_rate": 4.978821930648704e-06,
"logits/chosen": -22.288013458251953,
"logits/rejected": -22.197546005249023,
"logps/chosen": -1.9143011569976807,
"logps/rejected": -1.8838450908660889,
"loss": 2.0168,
"odds_ratio_loss": 1.0252134799957275,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.19143010675907135,
"rewards/margins": -0.0030455999076366425,
"rewards/rejected": -0.1883845031261444,
"sft_loss": 1.9143011569976807,
"step": 70
},
{
"epoch": 0.14225383418537452,
"grad_norm": 2.63519549369812,
"learning_rate": 4.97234095028576e-06,
"logits/chosen": -22.663928985595703,
"logits/rejected": -22.51036834716797,
"logps/chosen": -1.8908298015594482,
"logps/rejected": -1.9426231384277344,
"loss": 1.9749,
"odds_ratio_loss": 0.8411667943000793,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1890830099582672,
"rewards/margins": 0.005179307423532009,
"rewards/rejected": -0.1942623108625412,
"sft_loss": 1.8908298015594482,
"step": 80
},
{
"epoch": 0.16003556345854633,
"grad_norm": 1.5385671854019165,
"learning_rate": 4.965001486488743e-06,
"logits/chosen": -22.35540199279785,
"logits/rejected": -22.453685760498047,
"logps/chosen": -1.6930122375488281,
"logps/rejected": -1.8899803161621094,
"loss": 1.7643,
"odds_ratio_loss": 0.7125129699707031,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.16930122673511505,
"rewards/margins": 0.019696824252605438,
"rewards/rejected": -0.1889980286359787,
"sft_loss": 1.6930122375488281,
"step": 90
},
{
"epoch": 0.17781729273171815,
"grad_norm": 1.6486105918884277,
"learning_rate": 4.956806087776732e-06,
"logits/chosen": -22.912220001220703,
"logits/rejected": -22.764265060424805,
"logps/chosen": -1.7519505023956299,
"logps/rejected": -2.004110813140869,
"loss": 1.8222,
"odds_ratio_loss": 0.7028593420982361,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.17519506812095642,
"rewards/margins": 0.025216031819581985,
"rewards/rejected": -0.2004111111164093,
"sft_loss": 1.7519505023956299,
"step": 100
},
{
"epoch": 0.19559902200489,
"grad_norm": 2.1504974365234375,
"learning_rate": 4.947757599879411e-06,
"logits/chosen": -22.865467071533203,
"logits/rejected": -23.005489349365234,
"logps/chosen": -1.727837324142456,
"logps/rejected": -1.8906141519546509,
"loss": 1.8026,
"odds_ratio_loss": 0.7475000023841858,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1727837324142456,
"rewards/margins": 0.016277695074677467,
"rewards/rejected": -0.18906141817569733,
"sft_loss": 1.727837324142456,
"step": 110
},
{
"epoch": 0.2133807512780618,
"grad_norm": 3.6934735774993896,
"learning_rate": 4.937859164748931e-06,
"logits/chosen": -22.859783172607422,
"logits/rejected": -23.031169891357422,
"logps/chosen": -1.5483535528182983,
"logps/rejected": -1.6135647296905518,
"loss": 1.624,
"odds_ratio_loss": 0.7560455203056335,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15483535826206207,
"rewards/margins": 0.006521114148199558,
"rewards/rejected": -0.16135647892951965,
"sft_loss": 1.5483535528182983,
"step": 120
},
{
"epoch": 0.23116248055123362,
"grad_norm": 3.72802734375,
"learning_rate": 4.92711421946891e-06,
"logits/chosen": -23.100276947021484,
"logits/rejected": -22.69415283203125,
"logps/chosen": -1.5568244457244873,
"logps/rejected": -1.8098100423812866,
"loss": 1.6294,
"odds_ratio_loss": 0.7258477210998535,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15568244457244873,
"rewards/margins": 0.025298580527305603,
"rewards/rejected": -0.18098104000091553,
"sft_loss": 1.5568244457244873,
"step": 130
},
{
"epoch": 0.24894420982440543,
"grad_norm": 5.469463348388672,
"learning_rate": 4.915526495060961e-06,
"logits/chosen": -23.371618270874023,
"logits/rejected": -23.214031219482422,
"logps/chosen": -1.4800597429275513,
"logps/rejected": -1.746651291847229,
"loss": 1.5507,
"odds_ratio_loss": 0.7065833806991577,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.14800596237182617,
"rewards/margins": 0.026659172028303146,
"rewards/rejected": -0.17466513812541962,
"sft_loss": 1.4800597429275513,
"step": 140
},
{
"epoch": 0.26672593909757725,
"grad_norm": 4.669017314910889,
"learning_rate": 4.903100015189153e-06,
"logits/chosen": -22.959320068359375,
"logits/rejected": -23.156007766723633,
"logps/chosen": -1.5119131803512573,
"logps/rejected": -1.708356261253357,
"loss": 1.5853,
"odds_ratio_loss": 0.7340201735496521,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1511913239955902,
"rewards/margins": 0.01964429020881653,
"rewards/rejected": -0.17083561420440674,
"sft_loss": 1.5119131803512573,
"step": 150
},
{
"epoch": 0.28450766837074903,
"grad_norm": 1.5560555458068848,
"learning_rate": 4.889839094762848e-06,
"logits/chosen": -22.915985107421875,
"logits/rejected": -22.794408798217773,
"logps/chosen": -1.6158710718154907,
"logps/rejected": -1.7288596630096436,
"loss": 1.6909,
"odds_ratio_loss": 0.7498828768730164,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.16158713400363922,
"rewards/margins": 0.011298839002847672,
"rewards/rejected": -0.1728859841823578,
"sft_loss": 1.6158710718154907,
"step": 160
},
{
"epoch": 0.3022893976439209,
"grad_norm": 1.565077304840088,
"learning_rate": 4.875748338438416e-06,
"logits/chosen": -23.140369415283203,
"logits/rejected": -23.20174789428711,
"logps/chosen": -1.532865285873413,
"logps/rejected": -1.6764837503433228,
"loss": 1.6069,
"odds_ratio_loss": 0.7403478026390076,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.15328654646873474,
"rewards/margins": 0.014361831359565258,
"rewards/rejected": -0.16764836013317108,
"sft_loss": 1.532865285873413,
"step": 170
},
{
"epoch": 0.32007112691709266,
"grad_norm": 3.263695240020752,
"learning_rate": 4.8608326390203386e-06,
"logits/chosen": -22.981613159179688,
"logits/rejected": -22.818286895751953,
"logps/chosen": -1.485670804977417,
"logps/rejected": -1.6812422275543213,
"loss": 1.5542,
"odds_ratio_loss": 0.6854217052459717,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1485670804977417,
"rewards/margins": 0.019557146355509758,
"rewards/rejected": -0.1681242287158966,
"sft_loss": 1.485670804977417,
"step": 180
},
{
"epoch": 0.3378528561902645,
"grad_norm": 2.1444835662841797,
"learning_rate": 4.845097175762251e-06,
"logits/chosen": -23.199800491333008,
"logits/rejected": -23.2564640045166,
"logps/chosen": -1.4873155355453491,
"logps/rejected": -1.5498250722885132,
"loss": 1.5614,
"odds_ratio_loss": 0.7410100698471069,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1487315446138382,
"rewards/margins": 0.006250949110835791,
"rewards/rejected": -0.15498249232769012,
"sft_loss": 1.4873155355453491,
"step": 190
},
{
"epoch": 0.3556345854634363,
"grad_norm": 5.516879558563232,
"learning_rate": 4.8285474125685286e-06,
"logits/chosen": -23.00992774963379,
"logits/rejected": -22.893043518066406,
"logps/chosen": -1.520996332168579,
"logps/rejected": -1.6076465845108032,
"loss": 1.5966,
"odds_ratio_loss": 0.7559183239936829,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1520996391773224,
"rewards/margins": 0.008664996363222599,
"rewards/rejected": -0.1607646495103836,
"sft_loss": 1.520996332168579,
"step": 200
},
{
"epoch": 0.37341631473660813,
"grad_norm": 1.6477737426757812,
"learning_rate": 4.811189096097025e-06,
"logits/chosen": -22.601619720458984,
"logits/rejected": -22.704158782958984,
"logps/chosen": -1.5167438983917236,
"logps/rejected": -1.700338363647461,
"loss": 1.5896,
"odds_ratio_loss": 0.7285597920417786,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.15167437493801117,
"rewards/margins": 0.018359454348683357,
"rewards/rejected": -0.17003384232521057,
"sft_loss": 1.5167438983917236,
"step": 210
},
{
"epoch": 0.39119804400978,
"grad_norm": 2.6526737213134766,
"learning_rate": 4.793028253763633e-06,
"logits/chosen": -22.879850387573242,
"logits/rejected": -22.78567123413086,
"logps/chosen": -1.4604800939559937,
"logps/rejected": -1.6260970830917358,
"loss": 1.537,
"odds_ratio_loss": 0.7654477953910828,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14604800939559937,
"rewards/margins": 0.016561714932322502,
"rewards/rejected": -0.16260972619056702,
"sft_loss": 1.4604800939559937,
"step": 220
},
{
"epoch": 0.40897977328295176,
"grad_norm": 2.860865354537964,
"learning_rate": 4.774071191649352e-06,
"logits/chosen": -22.46622657775879,
"logits/rejected": -22.480607986450195,
"logps/chosen": -1.376908779144287,
"logps/rejected": -1.6316293478012085,
"loss": 1.4435,
"odds_ratio_loss": 0.6654683351516724,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13769087195396423,
"rewards/margins": 0.025472048670053482,
"rewards/rejected": -0.16316291689872742,
"sft_loss": 1.376908779144287,
"step": 230
},
{
"epoch": 0.4267615025561236,
"grad_norm": 2.685337781906128,
"learning_rate": 4.7543244923105975e-06,
"logits/chosen": -22.682777404785156,
"logits/rejected": -22.806440353393555,
"logps/chosen": -1.5592434406280518,
"logps/rejected": -1.5708439350128174,
"loss": 1.6395,
"odds_ratio_loss": 0.8026041984558105,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15592436492443085,
"rewards/margins": 0.0011600303696468472,
"rewards/rejected": -0.1570843905210495,
"sft_loss": 1.5592434406280518,
"step": 240
},
{
"epoch": 0.4445432318292954,
"grad_norm": 1.3707021474838257,
"learning_rate": 4.733795012493506e-06,
"logits/chosen": -22.8146915435791,
"logits/rejected": -22.913793563842773,
"logps/chosen": -1.5595623254776,
"logps/rejected": -1.6163349151611328,
"loss": 1.6357,
"odds_ratio_loss": 0.7609250545501709,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1559562385082245,
"rewards/margins": 0.005677259061485529,
"rewards/rejected": -0.1616334766149521,
"sft_loss": 1.5595623254776,
"step": 250
},
{
"epoch": 0.46232496110246724,
"grad_norm": 1.3753399848937988,
"learning_rate": 4.712489880753035e-06,
"logits/chosen": -22.511287689208984,
"logits/rejected": -22.446317672729492,
"logps/chosen": -1.3289070129394531,
"logps/rejected": -1.4945290088653564,
"loss": 1.3952,
"odds_ratio_loss": 0.6626302003860474,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.1328907012939453,
"rewards/margins": 0.016562188044190407,
"rewards/rejected": -0.14945289492607117,
"sft_loss": 1.3289070129394531,
"step": 260
},
{
"epoch": 0.480106690375639,
"grad_norm": 5.678652286529541,
"learning_rate": 4.690416494977673e-06,
"logits/chosen": -22.829517364501953,
"logits/rejected": -22.87631607055664,
"logps/chosen": -1.4606059789657593,
"logps/rejected": -1.6754430532455444,
"loss": 1.5279,
"odds_ratio_loss": 0.6730555295944214,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.14606061577796936,
"rewards/margins": 0.02148369327187538,
"rewards/rejected": -0.16754429042339325,
"sft_loss": 1.4606059789657593,
"step": 270
},
{
"epoch": 0.49788841964881086,
"grad_norm": 1.2074130773544312,
"learning_rate": 4.667582519820639e-06,
"logits/chosen": -22.504804611206055,
"logits/rejected": -22.659706115722656,
"logps/chosen": -1.479263424873352,
"logps/rejected": -1.5646381378173828,
"loss": 1.5534,
"odds_ratio_loss": 0.7413693070411682,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.14792636036872864,
"rewards/margins": 0.008537469431757927,
"rewards/rejected": -0.15646381676197052,
"sft_loss": 1.479263424873352,
"step": 280
},
{
"epoch": 0.5156701489219827,
"grad_norm": 2.7386221885681152,
"learning_rate": 4.643995884038443e-06,
"logits/chosen": -22.59560775756836,
"logits/rejected": -22.57559585571289,
"logps/chosen": -1.3870899677276611,
"logps/rejected": -1.5722427368164062,
"loss": 1.4562,
"odds_ratio_loss": 0.6912243962287903,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13870900869369507,
"rewards/margins": 0.01851527765393257,
"rewards/rejected": -0.15722428262233734,
"sft_loss": 1.3870899677276611,
"step": 290
},
{
"epoch": 0.5334518781951545,
"grad_norm": 1.5327345132827759,
"learning_rate": 4.6196647777377475e-06,
"logits/chosen": -22.43231201171875,
"logits/rejected": -22.38507652282715,
"logps/chosen": -1.4321014881134033,
"logps/rejected": -1.4831379652023315,
"loss": 1.5072,
"odds_ratio_loss": 0.7509574294090271,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14321014285087585,
"rewards/margins": 0.005103647243231535,
"rewards/rejected": -0.14831380546092987,
"sft_loss": 1.4321014881134033,
"step": 300
},
{
"epoch": 0.5512336074683263,
"grad_norm": 1.276062250137329,
"learning_rate": 4.59459764953147e-06,
"logits/chosen": -22.478355407714844,
"logits/rejected": -22.29865264892578,
"logps/chosen": -1.445011854171753,
"logps/rejected": -1.550065279006958,
"loss": 1.5147,
"odds_ratio_loss": 0.6971566081047058,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14450117945671082,
"rewards/margins": 0.010505353100597858,
"rewards/rejected": -0.1550065577030182,
"sft_loss": 1.445011854171753,
"step": 310
},
{
"epoch": 0.5690153367414981,
"grad_norm": 2.80613112449646,
"learning_rate": 4.568803203605133e-06,
"logits/chosen": -22.582855224609375,
"logits/rejected": -22.391347885131836,
"logps/chosen": -1.3941065073013306,
"logps/rejected": -1.580993413925171,
"loss": 1.4659,
"odds_ratio_loss": 0.7180419564247131,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13941065967082977,
"rewards/margins": 0.018688684329390526,
"rewards/rejected": -0.15809933841228485,
"sft_loss": 1.3941065073013306,
"step": 320
},
{
"epoch": 0.58679706601467,
"grad_norm": 5.356297492980957,
"learning_rate": 4.542290396694462e-06,
"logits/chosen": -22.250286102294922,
"logits/rejected": -22.175914764404297,
"logps/chosen": -1.4387528896331787,
"logps/rejected": -1.5810470581054688,
"loss": 1.5106,
"odds_ratio_loss": 0.7184728980064392,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14387528598308563,
"rewards/margins": 0.014229406602680683,
"rewards/rejected": -0.15810470283031464,
"sft_loss": 1.4387528896331787,
"step": 330
},
{
"epoch": 0.6045787952878418,
"grad_norm": 8.996047019958496,
"learning_rate": 4.515068434975298e-06,
"logits/chosen": -22.00839614868164,
"logits/rejected": -22.072261810302734,
"logps/chosen": -1.4673653841018677,
"logps/rejected": -1.6608636379241943,
"loss": 1.5376,
"odds_ratio_loss": 0.7021427154541016,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14673654735088348,
"rewards/margins": 0.019349832087755203,
"rewards/rejected": -0.16608639061450958,
"sft_loss": 1.4673653841018677,
"step": 340
},
{
"epoch": 0.6223605245610135,
"grad_norm": 1.460726261138916,
"learning_rate": 4.487146770866887e-06,
"logits/chosen": -22.291297912597656,
"logits/rejected": -22.382854461669922,
"logps/chosen": -1.406706690788269,
"logps/rejected": -1.4625658988952637,
"loss": 1.4815,
"odds_ratio_loss": 0.747775673866272,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.14067067205905914,
"rewards/margins": 0.005585917271673679,
"rewards/rejected": -0.14625659584999084,
"sft_loss": 1.406706690788269,
"step": 350
},
{
"epoch": 0.6401422538341853,
"grad_norm": 1.722812533378601,
"learning_rate": 4.458535099749666e-06,
"logits/chosen": -22.276639938354492,
"logits/rejected": -22.166675567626953,
"logps/chosen": -1.5117685794830322,
"logps/rejected": -1.5999605655670166,
"loss": 1.5911,
"odds_ratio_loss": 0.7935177087783813,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.15117685496807098,
"rewards/margins": 0.008819197304546833,
"rewards/rejected": -0.15999604761600494,
"sft_loss": 1.5117685794830322,
"step": 360
},
{
"epoch": 0.6579239831073572,
"grad_norm": 2.568336248397827,
"learning_rate": 4.429243356598694e-06,
"logits/chosen": -21.958419799804688,
"logits/rejected": -21.927824020385742,
"logps/chosen": -1.4804319143295288,
"logps/rejected": -1.6579450368881226,
"loss": 1.5496,
"odds_ratio_loss": 0.6912356615066528,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1480431854724884,
"rewards/margins": 0.017751310020685196,
"rewards/rejected": -0.1657945215702057,
"sft_loss": 1.4804319143295288,
"step": 370
},
{
"epoch": 0.675705712380529,
"grad_norm": 1.4206441640853882,
"learning_rate": 4.399281712533875e-06,
"logits/chosen": -22.067081451416016,
"logits/rejected": -22.091421127319336,
"logps/chosen": -1.4124424457550049,
"logps/rejected": -1.4996305704116821,
"loss": 1.4873,
"odds_ratio_loss": 0.7487770318984985,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14124426245689392,
"rewards/margins": 0.008718819357454777,
"rewards/rejected": -0.14996306598186493,
"sft_loss": 1.4124424457550049,
"step": 380
},
{
"epoch": 0.6934874416537008,
"grad_norm": 2.3953347206115723,
"learning_rate": 4.368660571288192e-06,
"logits/chosen": -22.193960189819336,
"logits/rejected": -22.225393295288086,
"logps/chosen": -1.394415020942688,
"logps/rejected": -1.500723123550415,
"loss": 1.4702,
"odds_ratio_loss": 0.7577108144760132,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13944150507450104,
"rewards/margins": 0.010630805976688862,
"rewards/rejected": -0.15007230639457703,
"sft_loss": 1.394415020942688,
"step": 390
},
{
"epoch": 0.7112691709268726,
"grad_norm": 1.4220997095108032,
"learning_rate": 4.337390565595163e-06,
"logits/chosen": -21.68547248840332,
"logits/rejected": -21.761310577392578,
"logps/chosen": -1.464005708694458,
"logps/rejected": -1.5315691232681274,
"loss": 1.5392,
"odds_ratio_loss": 0.75159752368927,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1464005708694458,
"rewards/margins": 0.006756337825208902,
"rewards/rejected": -0.15315690636634827,
"sft_loss": 1.464005708694458,
"step": 400
},
{
"epoch": 0.7290509002000445,
"grad_norm": 1.8401212692260742,
"learning_rate": 4.305482553496786e-06,
"logits/chosen": -21.259353637695312,
"logits/rejected": -21.159082412719727,
"logps/chosen": -1.3970698118209839,
"logps/rejected": -1.5361021757125854,
"loss": 1.4689,
"odds_ratio_loss": 0.7183545827865601,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13970699906349182,
"rewards/margins": 0.01390322856605053,
"rewards/rejected": -0.1536101996898651,
"sft_loss": 1.3970698118209839,
"step": 410
},
{
"epoch": 0.7468326294732163,
"grad_norm": 2.2964348793029785,
"learning_rate": 4.272947614573244e-06,
"logits/chosen": -21.679210662841797,
"logits/rejected": -21.884136199951172,
"logps/chosen": -1.447422981262207,
"logps/rejected": -1.5282857418060303,
"loss": 1.5205,
"odds_ratio_loss": 0.730276346206665,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14474229514598846,
"rewards/margins": 0.008086279034614563,
"rewards/rejected": -0.15282857418060303,
"sft_loss": 1.447422981262207,
"step": 420
},
{
"epoch": 0.7646143587463881,
"grad_norm": 1.2190438508987427,
"learning_rate": 4.23979704609569e-06,
"logits/chosen": -21.96237564086914,
"logits/rejected": -22.065784454345703,
"logps/chosen": -1.398108959197998,
"logps/rejected": -1.495884656906128,
"loss": 1.4676,
"odds_ratio_loss": 0.6946425437927246,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13981090486049652,
"rewards/margins": 0.009777549654245377,
"rewards/rejected": -0.1495884507894516,
"sft_loss": 1.398108959197998,
"step": 430
},
{
"epoch": 0.78239608801956,
"grad_norm": 2.3040215969085693,
"learning_rate": 4.206042359103435e-06,
"logits/chosen": -21.56629180908203,
"logits/rejected": -21.716127395629883,
"logps/chosen": -1.487396001815796,
"logps/rejected": -1.617078185081482,
"loss": 1.5602,
"odds_ratio_loss": 0.7281696796417236,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.14873960614204407,
"rewards/margins": 0.012968212366104126,
"rewards/rejected": -0.1617078334093094,
"sft_loss": 1.487396001815796,
"step": 440
},
{
"epoch": 0.8001778172927317,
"grad_norm": 2.5727310180664062,
"learning_rate": 4.17169527440691e-06,
"logits/chosen": -21.884145736694336,
"logits/rejected": -21.738811492919922,
"logps/chosen": -1.4501018524169922,
"logps/rejected": -1.4668500423431396,
"loss": 1.5286,
"odds_ratio_loss": 0.7853611707687378,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.14501020312309265,
"rewards/margins": 0.0016747992485761642,
"rewards/rejected": -0.14668500423431396,
"sft_loss": 1.4501018524169922,
"step": 450
},
{
"epoch": 0.8179595465659035,
"grad_norm": 2.54972243309021,
"learning_rate": 4.136767718517797e-06,
"logits/chosen": -21.746496200561523,
"logits/rejected": -21.7362060546875,
"logps/chosen": -1.3023537397384644,
"logps/rejected": -1.5028297901153564,
"loss": 1.368,
"odds_ratio_loss": 0.6567283868789673,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13023535907268524,
"rewards/margins": 0.020047612488269806,
"rewards/rejected": -0.15028299391269684,
"sft_loss": 1.3023537397384644,
"step": 460
},
{
"epoch": 0.8357412758390753,
"grad_norm": 6.595831871032715,
"learning_rate": 4.1012718195077196e-06,
"logits/chosen": -21.96458625793457,
"logits/rejected": -22.172712326049805,
"logps/chosen": -1.4211918115615845,
"logps/rejected": -1.4663982391357422,
"loss": 1.4961,
"odds_ratio_loss": 0.7494389414787292,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14211918413639069,
"rewards/margins": 0.004520639777183533,
"rewards/rejected": -0.14663982391357422,
"sft_loss": 1.4211918115615845,
"step": 470
},
{
"epoch": 0.8535230051122472,
"grad_norm": 3.3689377307891846,
"learning_rate": 4.065219902796953e-06,
"logits/chosen": -21.535301208496094,
"logits/rejected": -21.487293243408203,
"logps/chosen": -1.3686919212341309,
"logps/rejected": -1.5178884267807007,
"loss": 1.4414,
"odds_ratio_loss": 0.7275662422180176,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13686920702457428,
"rewards/margins": 0.014919650740921497,
"rewards/rejected": -0.1517888456583023,
"sft_loss": 1.3686919212341309,
"step": 480
},
{
"epoch": 0.871304734385419,
"grad_norm": 1.1600650548934937,
"learning_rate": 4.028624486874608e-06,
"logits/chosen": -21.398052215576172,
"logits/rejected": -21.58942222595215,
"logps/chosen": -1.3973881006240845,
"logps/rejected": -1.5505540370941162,
"loss": 1.4689,
"odds_ratio_loss": 0.7148123383522034,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1397387981414795,
"rewards/margins": 0.015316602773964405,
"rewards/rejected": -0.15505541861057281,
"sft_loss": 1.3973881006240845,
"step": 490
},
{
"epoch": 0.8890864636585908,
"grad_norm": 1.584820032119751,
"learning_rate": 3.99149827895177e-06,
"logits/chosen": -21.60881996154785,
"logits/rejected": -21.396835327148438,
"logps/chosen": -1.4577990770339966,
"logps/rejected": -1.5503555536270142,
"loss": 1.5296,
"odds_ratio_loss": 0.7181479334831238,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14577992260456085,
"rewards/margins": 0.009255652315914631,
"rewards/rejected": -0.15503555536270142,
"sft_loss": 1.4577990770339966,
"step": 500
},
{
"epoch": 0.8890864636585908,
"eval_logits/chosen": -21.487462997436523,
"eval_logits/rejected": -21.574512481689453,
"eval_logps/chosen": -1.3780959844589233,
"eval_logps/rejected": -1.5480619668960571,
"eval_loss": 1.4461547136306763,
"eval_odds_ratio_loss": 0.6805880665779114,
"eval_rewards/accuracies": 0.546999990940094,
"eval_rewards/chosen": -0.13780958950519562,
"eval_rewards/margins": 0.016996610909700394,
"eval_rewards/rejected": -0.1548061966896057,
"eval_runtime": 80.0397,
"eval_samples_per_second": 12.494,
"eval_sft_loss": 1.3780959844589233,
"eval_steps_per_second": 6.247,
"step": 500
},
{
"epoch": 0.9068681929317626,
"grad_norm": 2.9641082286834717,
"learning_rate": 3.953854170549114e-06,
"logits/chosen": -21.522262573242188,
"logits/rejected": -21.48137092590332,
"logps/chosen": -1.3978930711746216,
"logps/rejected": -1.4638049602508545,
"loss": 1.4702,
"odds_ratio_loss": 0.722897469997406,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13978929817676544,
"rewards/margins": 0.006591203156858683,
"rewards/rejected": -0.1463804990053177,
"sft_loss": 1.3978930711746216,
"step": 510
},
{
"epoch": 0.9246499222049345,
"grad_norm": 1.988604187965393,
"learning_rate": 3.91570523302051e-06,
"logits/chosen": -21.512929916381836,
"logits/rejected": -21.350711822509766,
"logps/chosen": -1.4139622449874878,
"logps/rejected": -1.502074122428894,
"loss": 1.4897,
"odds_ratio_loss": 0.7573299407958984,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14139625430107117,
"rewards/margins": 0.008811171166598797,
"rewards/rejected": -0.15020740032196045,
"sft_loss": 1.4139622449874878,
"step": 520
},
{
"epoch": 0.9424316514781063,
"grad_norm": 1.4567950963974,
"learning_rate": 3.8770647130141996e-06,
"logits/chosen": -21.612693786621094,
"logits/rejected": -21.457687377929688,
"logps/chosen": -1.3569138050079346,
"logps/rejected": -1.5465893745422363,
"loss": 1.4271,
"odds_ratio_loss": 0.7022345662117004,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13569141924381256,
"rewards/margins": 0.01896754465997219,
"rewards/rejected": -0.1546589434146881,
"sft_loss": 1.3569138050079346,
"step": 530
},
{
"epoch": 0.960213380751278,
"grad_norm": 1.3677376508712769,
"learning_rate": 3.837946027873086e-06,
"logits/chosen": -21.51246452331543,
"logits/rejected": -21.367631912231445,
"logps/chosen": -1.4506080150604248,
"logps/rejected": -1.578880786895752,
"loss": 1.5258,
"odds_ratio_loss": 0.7515760660171509,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.14506080746650696,
"rewards/margins": 0.012827281840145588,
"rewards/rejected": -0.15788806974887848,
"sft_loss": 1.4506080150604248,
"step": 540
},
{
"epoch": 0.9779951100244498,
"grad_norm": 2.220174789428711,
"learning_rate": 3.7983627609757713e-06,
"logits/chosen": -21.598114013671875,
"logits/rejected": -21.58673095703125,
"logps/chosen": -1.4242851734161377,
"logps/rejected": -1.521756887435913,
"loss": 1.495,
"odds_ratio_loss": 0.7070504426956177,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14242851734161377,
"rewards/margins": 0.009747164323925972,
"rewards/rejected": -0.1521756947040558,
"sft_loss": 1.4242851734161377,
"step": 550
},
{
"epoch": 0.9957768392976217,
"grad_norm": 4.630890369415283,
"learning_rate": 3.758328657019924e-06,
"logits/chosen": -21.449283599853516,
"logits/rejected": -21.295719146728516,
"logps/chosen": -1.3235647678375244,
"logps/rejected": -1.4640743732452393,
"loss": 1.394,
"odds_ratio_loss": 0.7047213315963745,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1323564499616623,
"rewards/margins": 0.014050972647964954,
"rewards/rejected": -0.14640744030475616,
"sft_loss": 1.3235647678375244,
"step": 560
},
{
"epoch": 1.0135585685707935,
"grad_norm": 4.50676155090332,
"learning_rate": 3.717857617249642e-06,
"logits/chosen": -21.255306243896484,
"logits/rejected": -21.378076553344727,
"logps/chosen": -1.4302732944488525,
"logps/rejected": -1.5925706624984741,
"loss": 1.5037,
"odds_ratio_loss": 0.7346171140670776,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14302733540534973,
"rewards/margins": 0.016229737550020218,
"rewards/rejected": -0.15925706923007965,
"sft_loss": 1.4302732944488525,
"step": 570
},
{
"epoch": 1.0313402978439654,
"grad_norm": 1.0999839305877686,
"learning_rate": 3.6769636946284543e-06,
"logits/chosen": -21.335865020751953,
"logits/rejected": -21.204635620117188,
"logps/chosen": -1.2982518672943115,
"logps/rejected": -1.4139636754989624,
"loss": 1.3699,
"odds_ratio_loss": 0.7168216705322266,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.12982520461082458,
"rewards/margins": 0.011571166105568409,
"rewards/rejected": -0.14139637351036072,
"sft_loss": 1.2982518672943115,
"step": 580
},
{
"epoch": 1.049122027117137,
"grad_norm": 2.193345069885254,
"learning_rate": 3.6356610889596355e-06,
"logits/chosen": -21.441791534423828,
"logits/rejected": -21.434829711914062,
"logps/chosen": -1.371517539024353,
"logps/rejected": -1.4539680480957031,
"loss": 1.444,
"odds_ratio_loss": 0.724626362323761,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.13715174794197083,
"rewards/margins": 0.008245043456554413,
"rewards/rejected": -0.14539679884910583,
"sft_loss": 1.371517539024353,
"step": 590
},
{
"epoch": 1.066903756390309,
"grad_norm": 1.4549708366394043,
"learning_rate": 3.593964141955541e-06,
"logits/chosen": -21.462820053100586,
"logits/rejected": -21.234458923339844,
"logps/chosen": -1.355276107788086,
"logps/rejected": -1.4370988607406616,
"loss": 1.4288,
"odds_ratio_loss": 0.7349004149436951,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1355275958776474,
"rewards/margins": 0.008182285353541374,
"rewards/rejected": -0.14370988309383392,
"sft_loss": 1.355276107788086,
"step": 600
},
{
"epoch": 1.0846854856634809,
"grad_norm": 1.5671168565750122,
"learning_rate": 3.5518873322576573e-06,
"logits/chosen": -20.91110610961914,
"logits/rejected": -21.30324935913086,
"logps/chosen": -1.3413856029510498,
"logps/rejected": -1.4344289302825928,
"loss": 1.4148,
"odds_ratio_loss": 0.7339103817939758,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1341385543346405,
"rewards/margins": 0.009304327890276909,
"rewards/rejected": -0.14344289898872375,
"sft_loss": 1.3413856029510498,
"step": 610
},
{
"epoch": 1.1024672149366526,
"grad_norm": 1.315266728401184,
"learning_rate": 3.5094452704091143e-06,
"logits/chosen": -21.287628173828125,
"logits/rejected": -21.211769104003906,
"logps/chosen": -1.3973969221115112,
"logps/rejected": -1.5142686367034912,
"loss": 1.4697,
"odds_ratio_loss": 0.7228156924247742,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13973967730998993,
"rewards/margins": 0.01168716698884964,
"rewards/rejected": -0.15142686665058136,
"sft_loss": 1.3973969221115112,
"step": 620
},
{
"epoch": 1.1202489442098245,
"grad_norm": 3.814415693283081,
"learning_rate": 3.46665269378139e-06,
"logits/chosen": -21.241634368896484,
"logits/rejected": -21.107349395751953,
"logps/chosen": -1.4169210195541382,
"logps/rejected": -1.4841772317886353,
"loss": 1.4919,
"odds_ratio_loss": 0.7493141889572144,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1416921317577362,
"rewards/margins": 0.006725601851940155,
"rewards/rejected": -0.14841774106025696,
"sft_loss": 1.4169210195541382,
"step": 630
},
{
"epoch": 1.1380306734829961,
"grad_norm": 5.05172872543335,
"learning_rate": 3.4235244614569794e-06,
"logits/chosen": -21.426654815673828,
"logits/rejected": -21.443878173828125,
"logps/chosen": -1.452530026435852,
"logps/rejected": -1.5365841388702393,
"loss": 1.528,
"odds_ratio_loss": 0.7546505928039551,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1452530175447464,
"rewards/margins": 0.00840541161596775,
"rewards/rejected": -0.1536584198474884,
"sft_loss": 1.452530026435852,
"step": 640
},
{
"epoch": 1.155812402756168,
"grad_norm": 1.0250731706619263,
"learning_rate": 3.3800755490698008e-06,
"logits/chosen": -21.588850021362305,
"logits/rejected": -21.425325393676758,
"logps/chosen": -1.321417212486267,
"logps/rejected": -1.539794921875,
"loss": 1.3862,
"odds_ratio_loss": 0.6476024985313416,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13214172422885895,
"rewards/margins": 0.021837763488292694,
"rewards/rejected": -0.15397948026657104,
"sft_loss": 1.321417212486267,
"step": 650
},
{
"epoch": 1.17359413202934,
"grad_norm": 1.5032236576080322,
"learning_rate": 3.3363210436051287e-06,
"logits/chosen": -21.39287567138672,
"logits/rejected": -21.30692481994629,
"logps/chosen": -1.409549593925476,
"logps/rejected": -1.5230066776275635,
"loss": 1.4823,
"odds_ratio_loss": 0.7271451950073242,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14095497131347656,
"rewards/margins": 0.011345705017447472,
"rewards/rejected": -0.15230068564414978,
"sft_loss": 1.409549593925476,
"step": 660
},
{
"epoch": 1.1913758613025116,
"grad_norm": 1.4157321453094482,
"learning_rate": 3.292276138160867e-06,
"logits/chosen": -21.29572105407715,
"logits/rejected": -21.25027084350586,
"logps/chosen": -1.4160873889923096,
"logps/rejected": -1.496361255645752,
"loss": 1.491,
"odds_ratio_loss": 0.749149739742279,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14160871505737305,
"rewards/margins": 0.00802740640938282,
"rewards/rejected": -0.14963611960411072,
"sft_loss": 1.4160873889923096,
"step": 670
},
{
"epoch": 1.2091575905756835,
"grad_norm": 1.573752522468567,
"learning_rate": 3.2479561266719694e-06,
"logits/chosen": -21.345748901367188,
"logits/rejected": -21.232250213623047,
"logps/chosen": -1.3891535997390747,
"logps/rejected": -1.5042526721954346,
"loss": 1.4603,
"odds_ratio_loss": 0.7116107940673828,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13891534507274628,
"rewards/margins": 0.011509908363223076,
"rewards/rejected": -0.1504252701997757,
"sft_loss": 1.3891535997390747,
"step": 680
},
{
"epoch": 1.2269393198488552,
"grad_norm": 2.5362017154693604,
"learning_rate": 3.2033763985998533e-06,
"logits/chosen": -21.208703994750977,
"logits/rejected": -21.204181671142578,
"logps/chosen": -1.3326551914215088,
"logps/rejected": -1.5722543001174927,
"loss": 1.3975,
"odds_ratio_loss": 0.6482537984848022,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.13326552510261536,
"rewards/margins": 0.02395990863442421,
"rewards/rejected": -0.15722543001174927,
"sft_loss": 1.3326551914215088,
"step": 690
},
{
"epoch": 1.244721049122027,
"grad_norm": 1.910599946975708,
"learning_rate": 3.1585524335886335e-06,
"logits/chosen": -21.477584838867188,
"logits/rejected": -21.243457794189453,
"logps/chosen": -1.299839735031128,
"logps/rejected": -1.449894666671753,
"loss": 1.3692,
"odds_ratio_loss": 0.6940584182739258,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12998396158218384,
"rewards/margins": 0.015005489811301231,
"rewards/rejected": -0.14498946070671082,
"sft_loss": 1.299839735031128,
"step": 700
},
{
"epoch": 1.262502778395199,
"grad_norm": 2.3555686473846436,
"learning_rate": 3.1134997960900536e-06,
"logits/chosen": -20.757158279418945,
"logits/rejected": -20.784774780273438,
"logps/chosen": -1.2707315683364868,
"logps/rejected": -1.538629174232483,
"loss": 1.334,
"odds_ratio_loss": 0.6324664950370789,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.12707316875457764,
"rewards/margins": 0.026789745315909386,
"rewards/rejected": -0.15386290848255157,
"sft_loss": 1.2707315683364868,
"step": 710
},
{
"epoch": 1.2802845076683709,
"grad_norm": 1.596251130104065,
"learning_rate": 3.0682341299589583e-06,
"logits/chosen": -20.7999324798584,
"logits/rejected": -20.802942276000977,
"logps/chosen": -1.3453733921051025,
"logps/rejected": -1.4210965633392334,
"loss": 1.4194,
"odds_ratio_loss": 0.7405422329902649,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13453733921051025,
"rewards/margins": 0.0075723156332969666,
"rewards/rejected": -0.14210966229438782,
"sft_loss": 1.3453733921051025,
"step": 720
},
{
"epoch": 1.2980662369415426,
"grad_norm": 9.634610176086426,
"learning_rate": 3.022771153021201e-06,
"logits/chosen": -21.071128845214844,
"logits/rejected": -21.114444732666016,
"logps/chosen": -1.3551054000854492,
"logps/rejected": -1.5129811763763428,
"loss": 1.4248,
"odds_ratio_loss": 0.6970704197883606,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.13551053404808044,
"rewards/margins": 0.015787573531270027,
"rewards/rejected": -0.15129812061786652,
"sft_loss": 1.3551054000854492,
"step": 730
},
{
"epoch": 1.3158479662147144,
"grad_norm": 1.6211514472961426,
"learning_rate": 2.9771266516158625e-06,
"logits/chosen": -20.895437240600586,
"logits/rejected": -21.04778480529785,
"logps/chosen": -1.3368757963180542,
"logps/rejected": -1.4986876249313354,
"loss": 1.4062,
"odds_ratio_loss": 0.6928601264953613,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1336875855922699,
"rewards/margins": 0.016181182116270065,
"rewards/rejected": -0.14986875653266907,
"sft_loss": 1.3368757963180542,
"step": 740
},
{
"epoch": 1.3336296954878861,
"grad_norm": 1.4428294897079468,
"learning_rate": 2.9313164751136802e-06,
"logits/chosen": -20.872339248657227,
"logits/rejected": -21.019441604614258,
"logps/chosen": -1.4122194051742554,
"logps/rejected": -1.487066626548767,
"loss": 1.4842,
"odds_ratio_loss": 0.7194846272468567,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14122194051742554,
"rewards/margins": 0.007484720554202795,
"rewards/rejected": -0.14870667457580566,
"sft_loss": 1.4122194051742554,
"step": 750
},
{
"epoch": 1.351411424761058,
"grad_norm": 2.2369892597198486,
"learning_rate": 2.8853565304135956e-06,
"logits/chosen": -21.462568283081055,
"logits/rejected": -21.10171127319336,
"logps/chosen": -1.352461576461792,
"logps/rejected": -1.3995507955551147,
"loss": 1.4271,
"odds_ratio_loss": 0.7464177012443542,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1352461576461792,
"rewards/margins": 0.004708918742835522,
"rewards/rejected": -0.1399550884962082,
"sft_loss": 1.352461576461792,
"step": 760
},
{
"epoch": 1.36919315403423,
"grad_norm": 4.38085412979126,
"learning_rate": 2.839262776419313e-06,
"logits/chosen": -20.986604690551758,
"logits/rejected": -20.851150512695312,
"logps/chosen": -1.3386285305023193,
"logps/rejected": -1.5874344110488892,
"loss": 1.4054,
"odds_ratio_loss": 0.6678156852722168,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13386288285255432,
"rewards/margins": 0.024880561977624893,
"rewards/rejected": -0.15874342620372772,
"sft_loss": 1.3386285305023193,
"step": 770
},
{
"epoch": 1.3869748833074016,
"grad_norm": 1.5637321472167969,
"learning_rate": 2.793051218497817e-06,
"logits/chosen": -21.250728607177734,
"logits/rejected": -21.10789680480957,
"logps/chosen": -1.3795894384384155,
"logps/rejected": -1.4174426794052124,
"loss": 1.4542,
"odds_ratio_loss": 0.7466022968292236,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13795895874500275,
"rewards/margins": 0.003785322653129697,
"rewards/rejected": -0.14174428582191467,
"sft_loss": 1.3795894384384155,
"step": 780
},
{
"epoch": 1.4047566125805735,
"grad_norm": 1.276485800743103,
"learning_rate": 2.7467379029217437e-06,
"logits/chosen": -20.930208206176758,
"logits/rejected": -20.79922103881836,
"logps/chosen": -1.3249105215072632,
"logps/rejected": -1.5102876424789429,
"loss": 1.3942,
"odds_ratio_loss": 0.6923983693122864,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1324910670518875,
"rewards/margins": 0.018537694588303566,
"rewards/rejected": -0.15102875232696533,
"sft_loss": 1.3249105215072632,
"step": 790
},
{
"epoch": 1.4225383418537452,
"grad_norm": 1.1495212316513062,
"learning_rate": 2.7003389112975546e-06,
"logits/chosen": -21.19894027709961,
"logits/rejected": -21.32394790649414,
"logps/chosen": -1.3503518104553223,
"logps/rejected": -1.484006404876709,
"loss": 1.4195,
"odds_ratio_loss": 0.6918057203292847,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1350351870059967,
"rewards/margins": 0.013365456834435463,
"rewards/rejected": -0.14840063452720642,
"sft_loss": 1.3503518104553223,
"step": 800
},
{
"epoch": 1.440320071126917,
"grad_norm": 4.020893573760986,
"learning_rate": 2.653870354981437e-06,
"logits/chosen": -21.07791519165039,
"logits/rejected": -20.885608673095703,
"logps/chosen": -1.2470946311950684,
"logps/rejected": -1.3942331075668335,
"loss": 1.3155,
"odds_ratio_loss": 0.6838669776916504,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12470944970846176,
"rewards/margins": 0.014713853597640991,
"rewards/rejected": -0.13942332565784454,
"sft_loss": 1.2470946311950684,
"step": 810
},
{
"epoch": 1.458101800400089,
"grad_norm": 3.656785011291504,
"learning_rate": 2.6073483694848777e-06,
"logits/chosen": -20.693532943725586,
"logits/rejected": -21.043460845947266,
"logps/chosen": -1.288588285446167,
"logps/rejected": -1.4409494400024414,
"loss": 1.3582,
"odds_ratio_loss": 0.6965524554252625,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12885884940624237,
"rewards/margins": 0.015236112289130688,
"rewards/rejected": -0.14409494400024414,
"sft_loss": 1.288588285446167,
"step": 820
},
{
"epoch": 1.4758835296732609,
"grad_norm": 1.309704303741455,
"learning_rate": 2.560789108871847e-06,
"logits/chosen": -20.856311798095703,
"logits/rejected": -20.888708114624023,
"logps/chosen": -1.3494679927825928,
"logps/rejected": -1.5807578563690186,
"loss": 1.4162,
"odds_ratio_loss": 0.6674301028251648,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1349468231201172,
"rewards/margins": 0.02312898077070713,
"rewards/rejected": -0.15807577967643738,
"sft_loss": 1.3494679927825928,
"step": 830
},
{
"epoch": 1.4936652589464325,
"grad_norm": 5.437036037445068,
"learning_rate": 2.514208740149544e-06,
"logits/chosen": -21.045909881591797,
"logits/rejected": -21.26214599609375,
"logps/chosen": -1.4145755767822266,
"logps/rejected": -1.549298644065857,
"loss": 1.4864,
"odds_ratio_loss": 0.7186577320098877,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14145755767822266,
"rewards/margins": 0.013472315855324268,
"rewards/rejected": -0.15492987632751465,
"sft_loss": 1.4145755767822266,
"step": 840
},
{
"epoch": 1.5114469882196042,
"grad_norm": 2.7086102962493896,
"learning_rate": 2.46762343765464e-06,
"logits/chosen": -21.045820236206055,
"logits/rejected": -21.116756439208984,
"logps/chosen": -1.4063694477081299,
"logps/rejected": -1.5858089923858643,
"loss": 1.4749,
"odds_ratio_loss": 0.6853106021881104,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14063693583011627,
"rewards/margins": 0.01794394478201866,
"rewards/rejected": -0.15858088433742523,
"sft_loss": 1.4063694477081299,
"step": 850
},
{
"epoch": 1.5292287174927761,
"grad_norm": 3.8162646293640137,
"learning_rate": 2.4210493774369903e-06,
"logits/chosen": -20.788593292236328,
"logits/rejected": -20.716583251953125,
"logps/chosen": -1.406285285949707,
"logps/rejected": -1.5719993114471436,
"loss": 1.4773,
"odds_ratio_loss": 0.7099908590316772,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14062853157520294,
"rewards/margins": 0.016571396961808205,
"rewards/rejected": -0.1571999341249466,
"sft_loss": 1.406285285949707,
"step": 860
},
{
"epoch": 1.547010446765948,
"grad_norm": 1.2074657678604126,
"learning_rate": 2.374502731642732e-06,
"logits/chosen": -20.910995483398438,
"logits/rejected": -20.997020721435547,
"logps/chosen": -1.3468477725982666,
"logps/rejected": -1.490969181060791,
"loss": 1.4171,
"odds_ratio_loss": 0.7025480270385742,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.13468477129936218,
"rewards/margins": 0.014412140473723412,
"rewards/rejected": -0.14909692108631134,
"sft_loss": 1.3468477725982666,
"step": 870
},
{
"epoch": 1.56479217603912,
"grad_norm": 1.6771084070205688,
"learning_rate": 2.3279996628987556e-06,
"logits/chosen": -21.090503692626953,
"logits/rejected": -21.15408706665039,
"logps/chosen": -1.3241709470748901,
"logps/rejected": -1.4298092126846313,
"loss": 1.397,
"odds_ratio_loss": 0.7280608415603638,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13241711258888245,
"rewards/margins": 0.010563802905380726,
"rewards/rejected": -0.1429809182882309,
"sft_loss": 1.3241709470748901,
"step": 880
},
{
"epoch": 1.5825739053122916,
"grad_norm": 2.092092514038086,
"learning_rate": 2.281556318700474e-06,
"logits/chosen": -20.86192512512207,
"logits/rejected": -21.044658660888672,
"logps/chosen": -1.3072993755340576,
"logps/rejected": -1.3738138675689697,
"loss": 1.3822,
"odds_ratio_loss": 0.7485288381576538,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13072994351387024,
"rewards/margins": 0.006651435978710651,
"rewards/rejected": -0.13738137483596802,
"sft_loss": 1.3072993755340576,
"step": 890
},
{
"epoch": 1.6003556345854635,
"grad_norm": 6.660823822021484,
"learning_rate": 2.2351888258048408e-06,
"logits/chosen": -20.55089569091797,
"logits/rejected": -20.74386978149414,
"logps/chosen": -1.3101674318313599,
"logps/rejected": -1.4409325122833252,
"loss": 1.3826,
"odds_ratio_loss": 0.7239800691604614,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13101674616336823,
"rewards/margins": 0.013076506555080414,
"rewards/rejected": -0.14409324526786804,
"sft_loss": 1.3101674318313599,
"step": 900
},
{
"epoch": 1.6181373638586352,
"grad_norm": 1.3889552354812622,
"learning_rate": 2.188913284630584e-06,
"logits/chosen": -21.00895118713379,
"logits/rejected": -21.11439323425293,
"logps/chosen": -1.3723797798156738,
"logps/rejected": -1.4007512331008911,
"loss": 1.449,
"odds_ratio_loss": 0.7658642530441284,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13723799586296082,
"rewards/margins": 0.0028371470980346203,
"rewards/rejected": -0.14007511734962463,
"sft_loss": 1.3723797798156738,
"step": 910
},
{
"epoch": 1.635919093131807,
"grad_norm": 4.06219482421875,
"learning_rate": 2.1427457636675652e-06,
"logits/chosen": -21.082805633544922,
"logits/rejected": -21.207538604736328,
"logps/chosen": -1.336096167564392,
"logps/rejected": -1.4373667240142822,
"loss": 1.4096,
"odds_ratio_loss": 0.7348427176475525,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13360963761806488,
"rewards/margins": 0.010127037763595581,
"rewards/rejected": -0.14373667538166046,
"sft_loss": 1.336096167564392,
"step": 920
},
{
"epoch": 1.653700822404979,
"grad_norm": 1.464863657951355,
"learning_rate": 2.096702293897247e-06,
"logits/chosen": -20.881576538085938,
"logits/rejected": -20.812564849853516,
"logps/chosen": -1.3259438276290894,
"logps/rejected": -1.5576345920562744,
"loss": 1.3925,
"odds_ratio_loss": 0.6658841371536255,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13259439170360565,
"rewards/margins": 0.02316909097135067,
"rewards/rejected": -0.15576346218585968,
"sft_loss": 1.3259438276290894,
"step": 930
},
{
"epoch": 1.6714825516781509,
"grad_norm": 2.923940420150757,
"learning_rate": 2.0507988632261672e-06,
"logits/chosen": -20.792316436767578,
"logits/rejected": -20.86935043334961,
"logps/chosen": -1.3512170314788818,
"logps/rejected": -1.4976381063461304,
"loss": 1.4213,
"odds_ratio_loss": 0.7008249163627625,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1351216733455658,
"rewards/margins": 0.014642128720879555,
"rewards/rejected": -0.14976383745670319,
"sft_loss": 1.3512170314788818,
"step": 940
},
{
"epoch": 1.6892642809513225,
"grad_norm": 3.5238471031188965,
"learning_rate": 2.005051410934382e-06,
"logits/chosen": -20.95963478088379,
"logits/rejected": -20.97479248046875,
"logps/chosen": -1.4458208084106445,
"logps/rejected": -1.5041887760162354,
"loss": 1.5203,
"odds_ratio_loss": 0.7446683645248413,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14458207786083221,
"rewards/margins": 0.005836788564920425,
"rewards/rejected": -0.15041887760162354,
"sft_loss": 1.4458208084106445,
"step": 950
},
{
"epoch": 1.7070460102244942,
"grad_norm": 2.6721088886260986,
"learning_rate": 1.9594758221407843e-06,
"logits/chosen": -20.884212493896484,
"logits/rejected": -20.890071868896484,
"logps/chosen": -1.3164139986038208,
"logps/rejected": -1.506830096244812,
"loss": 1.3826,
"odds_ratio_loss": 0.6615261435508728,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13164140284061432,
"rewards/margins": 0.019041623920202255,
"rewards/rejected": -0.15068301558494568,
"sft_loss": 1.3164139986038208,
"step": 960
},
{
"epoch": 1.724827739497666,
"grad_norm": 2.1806442737579346,
"learning_rate": 1.9140879222872408e-06,
"logits/chosen": -20.64748191833496,
"logits/rejected": -20.80613136291504,
"logps/chosen": -1.3217017650604248,
"logps/rejected": -1.4095408916473389,
"loss": 1.3952,
"odds_ratio_loss": 0.7351614236831665,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.1321701854467392,
"rewards/margins": 0.008783898316323757,
"rewards/rejected": -0.14095407724380493,
"sft_loss": 1.3217017650604248,
"step": 970
},
{
"epoch": 1.742609468770838,
"grad_norm": 1.7276735305786133,
"learning_rate": 1.8689034716434346e-06,
"logits/chosen": -21.096982955932617,
"logits/rejected": -21.016551971435547,
"logps/chosen": -1.3933743238449097,
"logps/rejected": -1.4783251285552979,
"loss": 1.4665,
"odds_ratio_loss": 0.7310749292373657,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1393374353647232,
"rewards/margins": 0.008495080284774303,
"rewards/rejected": -0.14783251285552979,
"sft_loss": 1.3933743238449097,
"step": 980
},
{
"epoch": 1.76039119804401,
"grad_norm": 1.8123304843902588,
"learning_rate": 1.8239381598343576e-06,
"logits/chosen": -20.750640869140625,
"logits/rejected": -20.75037956237793,
"logps/chosen": -1.3481253385543823,
"logps/rejected": -1.4228088855743408,
"loss": 1.4223,
"odds_ratio_loss": 0.7416225671768188,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13481254875659943,
"rewards/margins": 0.0074683367274701595,
"rewards/rejected": -0.14228087663650513,
"sft_loss": 1.3481253385543823,
"step": 990
},
{
"epoch": 1.7781729273171816,
"grad_norm": 3.5161044597625732,
"learning_rate": 1.779207600392312e-06,
"logits/chosen": -21.101184844970703,
"logits/rejected": -21.04250717163086,
"logps/chosen": -1.4057555198669434,
"logps/rejected": -1.5241641998291016,
"loss": 1.4761,
"odds_ratio_loss": 0.7033491134643555,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.1405755579471588,
"rewards/margins": 0.011840855702757835,
"rewards/rejected": -0.1524164229631424,
"sft_loss": 1.4057555198669434,
"step": 1000
},
{
"epoch": 1.7781729273171816,
"eval_logits/chosen": -20.803815841674805,
"eval_logits/rejected": -20.891420364379883,
"eval_logps/chosen": -1.3395137786865234,
"eval_logps/rejected": -1.5095441341400146,
"eval_loss": 1.407221794128418,
"eval_odds_ratio_loss": 0.6770801544189453,
"eval_rewards/accuracies": 0.5350000262260437,
"eval_rewards/chosen": -0.1339513659477234,
"eval_rewards/margins": 0.017003033310174942,
"eval_rewards/rejected": -0.15095441043376923,
"eval_runtime": 80.0538,
"eval_samples_per_second": 12.492,
"eval_sft_loss": 1.3395137786865234,
"eval_steps_per_second": 6.246,
"step": 1000
},
{
"epoch": 1.7959546565903532,
"grad_norm": 3.0343945026397705,
"learning_rate": 1.7347273253353552e-06,
"logits/chosen": -20.704559326171875,
"logits/rejected": -20.68727684020996,
"logps/chosen": -1.314007043838501,
"logps/rejected": -1.4146376848220825,
"loss": 1.389,
"odds_ratio_loss": 0.7499723434448242,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1314007043838501,
"rewards/margins": 0.01006306428462267,
"rewards/rejected": -0.14146378636360168,
"sft_loss": 1.314007043838501,
"step": 1010
},
{
"epoch": 1.8137363858635251,
"grad_norm": 6.905886650085449,
"learning_rate": 1.690512779774029e-06,
"logits/chosen": -20.81467056274414,
"logits/rejected": -20.834705352783203,
"logps/chosen": -1.4023054838180542,
"logps/rejected": -1.652772307395935,
"loss": 1.4691,
"odds_ratio_loss": 0.6682060956954956,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.14023055136203766,
"rewards/margins": 0.025046680122613907,
"rewards/rejected": -0.16527722775936127,
"sft_loss": 1.4023054838180542,
"step": 1020
},
{
"epoch": 1.831518115136697,
"grad_norm": 1.7513582706451416,
"learning_rate": 1.6465793165482838e-06,
"logits/chosen": -20.86380386352539,
"logits/rejected": -20.82488441467285,
"logps/chosen": -1.2588412761688232,
"logps/rejected": -1.4600279331207275,
"loss": 1.3236,
"odds_ratio_loss": 0.6471126079559326,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.12588414549827576,
"rewards/margins": 0.02011866495013237,
"rewards/rejected": -0.14600279927253723,
"sft_loss": 1.2588412761688232,
"step": 1030
},
{
"epoch": 1.849299844409869,
"grad_norm": 2.2100415229797363,
"learning_rate": 1.6029421908964305e-06,
"logits/chosen": -20.778493881225586,
"logits/rejected": -20.64494514465332,
"logps/chosen": -1.3084795475006104,
"logps/rejected": -1.622815728187561,
"loss": 1.375,
"odds_ratio_loss": 0.664787232875824,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1308479607105255,
"rewards/margins": 0.03143361583352089,
"rewards/rejected": -0.1622815579175949,
"sft_loss": 1.3084795475006104,
"step": 1040
},
{
"epoch": 1.8670815736830408,
"grad_norm": 4.5934367179870605,
"learning_rate": 1.559616555157985e-06,
"logits/chosen": -21.22179412841797,
"logits/rejected": -20.979957580566406,
"logps/chosen": -1.3719347715377808,
"logps/rejected": -1.5128790140151978,
"loss": 1.4452,
"odds_ratio_loss": 0.7331027388572693,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1371934711933136,
"rewards/margins": 0.014094437472522259,
"rewards/rejected": -0.15128789842128754,
"sft_loss": 1.3719347715377808,
"step": 1050
},
{
"epoch": 1.8848633029562125,
"grad_norm": 1.7163333892822266,
"learning_rate": 1.516617453512252e-06,
"logits/chosen": -21.102188110351562,
"logits/rejected": -21.131221771240234,
"logps/chosen": -1.4223716259002686,
"logps/rejected": -1.4797694683074951,
"loss": 1.498,
"odds_ratio_loss": 0.7560666799545288,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14223715662956238,
"rewards/margins": 0.005739795975387096,
"rewards/rejected": -0.14797696471214294,
"sft_loss": 1.4223716259002686,
"step": 1060
},
{
"epoch": 1.9026450322293842,
"grad_norm": 9.351452827453613,
"learning_rate": 1.473959816754449e-06,
"logits/chosen": -20.615371704101562,
"logits/rejected": -20.649810791015625,
"logps/chosen": -1.3047014474868774,
"logps/rejected": -1.3762633800506592,
"loss": 1.3786,
"odds_ratio_loss": 0.7393638491630554,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.1304701417684555,
"rewards/margins": 0.00715619046241045,
"rewards/rejected": -0.13762633502483368,
"sft_loss": 1.3047014474868774,
"step": 1070
},
{
"epoch": 1.920426761502556,
"grad_norm": 2.190560817718506,
"learning_rate": 1.4316584571112213e-06,
"logits/chosen": -21.255840301513672,
"logits/rejected": -21.131498336791992,
"logps/chosen": -1.3643953800201416,
"logps/rejected": -1.461114525794983,
"loss": 1.4364,
"odds_ratio_loss": 0.7197447419166565,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13643954694271088,
"rewards/margins": 0.009671924635767937,
"rewards/rejected": -0.14611145853996277,
"sft_loss": 1.3643953800201416,
"step": 1080
},
{
"epoch": 1.938208490775728,
"grad_norm": 2.542182683944702,
"learning_rate": 1.389728063097306e-06,
"logits/chosen": -20.93314552307129,
"logits/rejected": -20.863218307495117,
"logps/chosen": -1.389034390449524,
"logps/rejected": -1.5989328622817993,
"loss": 1.4573,
"odds_ratio_loss": 0.6827085614204407,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1389034539461136,
"rewards/margins": 0.020989837124943733,
"rewards/rejected": -0.15989328920841217,
"sft_loss": 1.389034390449524,
"step": 1090
},
{
"epoch": 1.9559902200488999,
"grad_norm": 1.5650415420532227,
"learning_rate": 1.348183194415179e-06,
"logits/chosen": -20.95106315612793,
"logits/rejected": -20.61818504333496,
"logps/chosen": -1.323676347732544,
"logps/rejected": -1.5667550563812256,
"loss": 1.3885,
"odds_ratio_loss": 0.6487289071083069,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13236764073371887,
"rewards/margins": 0.02430787682533264,
"rewards/rejected": -0.1566755324602127,
"sft_loss": 1.323676347732544,
"step": 1100
},
{
"epoch": 1.9737719493220716,
"grad_norm": 1.7203210592269897,
"learning_rate": 1.3070382768994015e-06,
"logits/chosen": -20.69628143310547,
"logits/rejected": -20.650815963745117,
"logps/chosen": -1.3079763650894165,
"logps/rejected": -1.449339747428894,
"loss": 1.3762,
"odds_ratio_loss": 0.6826270818710327,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13079765439033508,
"rewards/margins": 0.014136332087218761,
"rewards/rejected": -0.14493396878242493,
"sft_loss": 1.3079763650894165,
"step": 1110
},
{
"epoch": 1.9915536785952432,
"grad_norm": 1.5598257780075073,
"learning_rate": 1.2663075975074746e-06,
"logits/chosen": -20.689380645751953,
"logits/rejected": -20.69732666015625,
"logps/chosen": -1.3402197360992432,
"logps/rejected": -1.503177285194397,
"loss": 1.4129,
"odds_ratio_loss": 0.7268449664115906,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13402198255062103,
"rewards/margins": 0.016295749694108963,
"rewards/rejected": -0.1503177136182785,
"sft_loss": 1.3402197360992432,
"step": 1120
},
{
"epoch": 2.009335407868415,
"grad_norm": 5.007309436798096,
"learning_rate": 1.2260052993589034e-06,
"logits/chosen": -20.855276107788086,
"logits/rejected": -20.814468383789062,
"logps/chosen": -1.4246357679367065,
"logps/rejected": -1.4585391283035278,
"loss": 1.5014,
"odds_ratio_loss": 0.7673634886741638,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1424635797739029,
"rewards/margins": 0.003390337573364377,
"rewards/rejected": -0.1458539217710495,
"sft_loss": 1.4246357679367065,
"step": 1130
},
{
"epoch": 2.027117137141587,
"grad_norm": 1.2563971281051636,
"learning_rate": 1.1861453768242099e-06,
"logits/chosen": -20.794506072998047,
"logits/rejected": -20.795894622802734,
"logps/chosen": -1.2917953729629517,
"logps/rejected": -1.483782172203064,
"loss": 1.3577,
"odds_ratio_loss": 0.6590424180030823,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.12917952239513397,
"rewards/margins": 0.019198691472411156,
"rewards/rejected": -0.14837822318077087,
"sft_loss": 1.2917953729629517,
"step": 1140
},
{
"epoch": 2.044898866414759,
"grad_norm": 8.363728523254395,
"learning_rate": 1.1467416706655982e-06,
"logits/chosen": -20.971622467041016,
"logits/rejected": -21.218524932861328,
"logps/chosen": -1.418050765991211,
"logps/rejected": -1.580128788948059,
"loss": 1.4924,
"odds_ratio_loss": 0.7438761591911316,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14180508255958557,
"rewards/margins": 0.016207797452807426,
"rewards/rejected": -0.15801288187503815,
"sft_loss": 1.418050765991211,
"step": 1150
},
{
"epoch": 2.062680595687931,
"grad_norm": 1.5622318983078003,
"learning_rate": 1.1078078632309559e-06,
"logits/chosen": -20.65304946899414,
"logits/rejected": -20.797122955322266,
"logps/chosen": -1.3177438974380493,
"logps/rejected": -1.4770663976669312,
"loss": 1.3849,
"odds_ratio_loss": 0.6711241006851196,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1317743957042694,
"rewards/margins": 0.015932243317365646,
"rewards/rejected": -0.14770662784576416,
"sft_loss": 1.3177438974380493,
"step": 1160
},
{
"epoch": 2.0804623249611023,
"grad_norm": 1.6626743078231812,
"learning_rate": 1.0693574737028627e-06,
"logits/chosen": -20.749677658081055,
"logits/rejected": -20.718555450439453,
"logps/chosen": -1.354952096939087,
"logps/rejected": -1.4761542081832886,
"loss": 1.428,
"odds_ratio_loss": 0.73005211353302,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13549521565437317,
"rewards/margins": 0.012120204977691174,
"rewards/rejected": -0.14761541783809662,
"sft_loss": 1.354952096939087,
"step": 1170
},
{
"epoch": 2.098244054234274,
"grad_norm": 3.4684457778930664,
"learning_rate": 1.0314038534042586e-06,
"logits/chosen": -20.948108673095703,
"logits/rejected": -20.716609954833984,
"logps/chosen": -1.2302569150924683,
"logps/rejected": -1.4216673374176025,
"loss": 1.2983,
"odds_ratio_loss": 0.6800249814987183,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.12302567809820175,
"rewards/margins": 0.019141051918268204,
"rewards/rejected": -0.14216673374176025,
"sft_loss": 1.2302569150924683,
"step": 1180
},
{
"epoch": 2.116025783507446,
"grad_norm": 1.7580640316009521,
"learning_rate": 9.939601811623946e-07,
"logits/chosen": -20.846065521240234,
"logits/rejected": -20.80862045288086,
"logps/chosen": -1.3318583965301514,
"logps/rejected": -1.4921131134033203,
"loss": 1.4031,
"odds_ratio_loss": 0.7127273678779602,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13318583369255066,
"rewards/margins": 0.016025487333536148,
"rewards/rejected": -0.1492113173007965,
"sft_loss": 1.3318583965301514,
"step": 1190
},
{
"epoch": 2.133807512780618,
"grad_norm": 2.0461864471435547,
"learning_rate": 9.570394587326825e-07,
"logits/chosen": -21.051130294799805,
"logits/rejected": -20.864850997924805,
"logps/chosen": -1.322939395904541,
"logps/rejected": -1.5531421899795532,
"loss": 1.3893,
"odds_ratio_loss": 0.6637840867042542,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1322939246892929,
"rewards/margins": 0.023020274937152863,
"rewards/rejected": -0.15531422197818756,
"sft_loss": 1.322939395904541,
"step": 1200
},
{
"epoch": 2.15158924205379,
"grad_norm": 1.2578119039535522,
"learning_rate": 9.206545062840302e-07,
"logits/chosen": -21.234752655029297,
"logits/rejected": -20.857492446899414,
"logps/chosen": -1.2849655151367188,
"logps/rejected": -1.4767402410507202,
"loss": 1.3511,
"odds_ratio_loss": 0.6610640287399292,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.12849654257297516,
"rewards/margins": 0.019177492707967758,
"rewards/rejected": -0.14767403900623322,
"sft_loss": 1.2849655151367188,
"step": 1210
},
{
"epoch": 2.1693709713269618,
"grad_norm": 1.5944854021072388,
"learning_rate": 8.848179579472285e-07,
"logits/chosen": -20.92203140258789,
"logits/rejected": -20.818485260009766,
"logps/chosen": -1.2799731492996216,
"logps/rejected": -1.326030969619751,
"loss": 1.3534,
"odds_ratio_loss": 0.7338781952857971,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.12799732387065887,
"rewards/margins": 0.0046057915315032005,
"rewards/rejected": -0.13260310888290405,
"sft_loss": 1.2799731492996216,
"step": 1220
},
{
"epoch": 2.1871527006001332,
"grad_norm": 3.263883352279663,
"learning_rate": 8.495422574279403e-07,
"logits/chosen": -20.327661514282227,
"logits/rejected": -20.28653907775879,
"logps/chosen": -1.262486219406128,
"logps/rejected": -1.4986459016799927,
"loss": 1.3277,
"odds_ratio_loss": 0.6521891951560974,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.12624862790107727,
"rewards/margins": 0.02361595258116722,
"rewards/rejected": -0.1498645842075348,
"sft_loss": 1.262486219406128,
"step": 1230
},
{
"epoch": 2.204934429873305,
"grad_norm": 2.0986313819885254,
"learning_rate": 8.148396536858063e-07,
"logits/chosen": -21.014957427978516,
"logits/rejected": -21.147602081298828,
"logps/chosen": -1.3925727605819702,
"logps/rejected": -1.589383840560913,
"loss": 1.463,
"odds_ratio_loss": 0.7045022249221802,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.1392572671175003,
"rewards/margins": 0.019681129604578018,
"rewards/rejected": -0.15893837809562683,
"sft_loss": 1.3925727605819702,
"step": 1240
},
{
"epoch": 2.222716159146477,
"grad_norm": 1.3406250476837158,
"learning_rate": 7.807221966811815e-07,
"logits/chosen": -20.607036590576172,
"logits/rejected": -20.66307830810547,
"logps/chosen": -1.2920827865600586,
"logps/rejected": -1.41164231300354,
"loss": 1.3638,
"odds_ratio_loss": 0.7169677019119263,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1292082816362381,
"rewards/margins": 0.01195596344769001,
"rewards/rejected": -0.14116425812244415,
"sft_loss": 1.2920827865600586,
"step": 1250
},
{
"epoch": 2.240497888419649,
"grad_norm": 2.084696054458618,
"learning_rate": 7.47201733190962e-07,
"logits/chosen": -20.630435943603516,
"logits/rejected": -20.60986328125,
"logps/chosen": -1.2978394031524658,
"logps/rejected": -1.403597116470337,
"loss": 1.3682,
"odds_ratio_loss": 0.7040928602218628,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.12978394329547882,
"rewards/margins": 0.010575750842690468,
"rewards/rejected": -0.14035969972610474,
"sft_loss": 1.2978394031524658,
"step": 1260
},
{
"epoch": 2.258279617692821,
"grad_norm": 1.7839128971099854,
"learning_rate": 7.142899026949721e-07,
"logits/chosen": -20.951190948486328,
"logits/rejected": -20.935705184936523,
"logps/chosen": -1.320299744606018,
"logps/rejected": -1.4439074993133545,
"loss": 1.3892,
"odds_ratio_loss": 0.6885126233100891,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13202998042106628,
"rewards/margins": 0.012360776774585247,
"rewards/rejected": -0.1443907469511032,
"sft_loss": 1.320299744606018,
"step": 1270
},
{
"epoch": 2.2760613469659923,
"grad_norm": 7.137161731719971,
"learning_rate": 6.819981333343273e-07,
"logits/chosen": -20.221033096313477,
"logits/rejected": -20.27283477783203,
"logps/chosen": -1.2987910509109497,
"logps/rejected": -1.4801701307296753,
"loss": 1.3676,
"odds_ratio_loss": 0.6884258985519409,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12987910211086273,
"rewards/margins": 0.01813790202140808,
"rewards/rejected": -0.148017019033432,
"sft_loss": 1.2987910509109497,
"step": 1280
},
{
"epoch": 2.293843076239164,
"grad_norm": 2.513110876083374,
"learning_rate": 6.503376379431839e-07,
"logits/chosen": -20.69548797607422,
"logits/rejected": -20.705198287963867,
"logps/chosen": -1.4108153581619263,
"logps/rejected": -1.396875262260437,
"loss": 1.4867,
"odds_ratio_loss": 0.7589144110679626,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1410815417766571,
"rewards/margins": -0.0013940061908215284,
"rewards/rejected": -0.13968753814697266,
"sft_loss": 1.4108153581619263,
"step": 1290
},
{
"epoch": 2.311624805512336,
"grad_norm": 6.825961112976074,
"learning_rate": 6.193194101552502e-07,
"logits/chosen": -20.706968307495117,
"logits/rejected": -20.34494400024414,
"logps/chosen": -1.327618956565857,
"logps/rejected": -1.4798409938812256,
"loss": 1.3947,
"odds_ratio_loss": 0.6709089279174805,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1327619105577469,
"rewards/margins": 0.01522219367325306,
"rewards/rejected": -0.147984117269516,
"sft_loss": 1.327618956565857,
"step": 1300
},
{
"epoch": 2.329406534785508,
"grad_norm": 2.9888756275177,
"learning_rate": 5.889542205864083e-07,
"logits/chosen": -20.558048248291016,
"logits/rejected": -20.51228904724121,
"logps/chosen": -1.3237196207046509,
"logps/rejected": -1.4659796953201294,
"loss": 1.3931,
"odds_ratio_loss": 0.6935244798660278,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13237197697162628,
"rewards/margins": 0.014225991442799568,
"rewards/rejected": -0.1465979516506195,
"sft_loss": 1.3237196207046509,
"step": 1310
},
{
"epoch": 2.34718826405868,
"grad_norm": 1.8925628662109375,
"learning_rate": 5.592526130947862e-07,
"logits/chosen": -20.927398681640625,
"logits/rejected": -20.855573654174805,
"logps/chosen": -1.3616701364517212,
"logps/rejected": -1.4716918468475342,
"loss": 1.4372,
"odds_ratio_loss": 0.7552787065505981,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.1361670196056366,
"rewards/margins": 0.011002160608768463,
"rewards/rejected": -0.14716917276382446,
"sft_loss": 1.3616701364517212,
"step": 1320
},
{
"epoch": 2.3649699933318518,
"grad_norm": 2.2592906951904297,
"learning_rate": 5.302249011195507e-07,
"logits/chosen": -20.643238067626953,
"logits/rejected": -20.706254959106445,
"logps/chosen": -1.286387324333191,
"logps/rejected": -1.3748500347137451,
"loss": 1.3577,
"odds_ratio_loss": 0.712990939617157,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12863874435424805,
"rewards/margins": 0.008846262469887733,
"rewards/rejected": -0.13748499751091003,
"sft_loss": 1.286387324333191,
"step": 1330
},
{
"epoch": 2.382751722605023,
"grad_norm": 3.97871470451355,
"learning_rate": 5.018811640997307e-07,
"logits/chosen": -20.570959091186523,
"logits/rejected": -20.81188201904297,
"logps/chosen": -1.383998155593872,
"logps/rejected": -1.646945595741272,
"loss": 1.4509,
"odds_ratio_loss": 0.6689848899841309,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.13839980959892273,
"rewards/margins": 0.026294732466340065,
"rewards/rejected": -0.16469456255435944,
"sft_loss": 1.383998155593872,
"step": 1340
},
{
"epoch": 2.400533451878195,
"grad_norm": 1.2727420330047607,
"learning_rate": 4.7423124397427105e-07,
"logits/chosen": -20.430959701538086,
"logits/rejected": -20.650379180908203,
"logps/chosen": -1.35360848903656,
"logps/rejected": -1.4245867729187012,
"loss": 1.4266,
"odds_ratio_loss": 0.7302565574645996,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13536083698272705,
"rewards/margins": 0.007097836583852768,
"rewards/rejected": -0.1424586921930313,
"sft_loss": 1.35360848903656,
"step": 1350
},
{
"epoch": 2.418315181151367,
"grad_norm": 2.574122428894043,
"learning_rate": 4.472847417645787e-07,
"logits/chosen": -20.755605697631836,
"logits/rejected": -20.458105087280273,
"logps/chosen": -1.3647658824920654,
"logps/rejected": -1.634526252746582,
"loss": 1.4294,
"odds_ratio_loss": 0.6465052366256714,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -0.13647659122943878,
"rewards/margins": 0.026976028457283974,
"rewards/rejected": -0.1634526252746582,
"sft_loss": 1.3647658824920654,
"step": 1360
},
{
"epoch": 2.436096910424539,
"grad_norm": 1.2747830152511597,
"learning_rate": 4.210510142406993e-07,
"logits/chosen": -20.725910186767578,
"logits/rejected": -20.539182662963867,
"logps/chosen": -1.3636181354522705,
"logps/rejected": -1.573249101638794,
"loss": 1.4312,
"odds_ratio_loss": 0.6754659414291382,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13636180758476257,
"rewards/margins": 0.020963111892342567,
"rewards/rejected": -0.1573249250650406,
"sft_loss": 1.3636181354522705,
"step": 1370
},
{
"epoch": 2.4538786396977104,
"grad_norm": 1.5959084033966064,
"learning_rate": 3.9553917067232966e-07,
"logits/chosen": -20.685565948486328,
"logits/rejected": -20.612730026245117,
"logps/chosen": -1.3631963729858398,
"logps/rejected": -1.501734972000122,
"loss": 1.4351,
"odds_ratio_loss": 0.7191514372825623,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13631963729858398,
"rewards/margins": 0.013853861019015312,
"rewards/rejected": -0.15017351508140564,
"sft_loss": 1.3631963729858398,
"step": 1380
},
{
"epoch": 2.4716603689708823,
"grad_norm": 1.8356739282608032,
"learning_rate": 3.707580696657509e-07,
"logits/chosen": -20.62293243408203,
"logits/rejected": -20.265270233154297,
"logps/chosen": -1.3268606662750244,
"logps/rejected": -1.4021425247192383,
"loss": 1.3996,
"odds_ratio_loss": 0.727665364742279,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1326860636472702,
"rewards/margins": 0.007528189569711685,
"rewards/rejected": -0.1402142494916916,
"sft_loss": 1.3268606662750244,
"step": 1390
},
{
"epoch": 2.489442098244054,
"grad_norm": 2.215832471847534,
"learning_rate": 3.4671631608781815e-07,
"logits/chosen": -20.820430755615234,
"logits/rejected": -20.70709991455078,
"logps/chosen": -1.3705365657806396,
"logps/rejected": -1.4663138389587402,
"loss": 1.4448,
"odds_ratio_loss": 0.742554783821106,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13705363869667053,
"rewards/margins": 0.009577738121151924,
"rewards/rejected": -0.1466313898563385,
"sft_loss": 1.3705365657806396,
"step": 1400
},
{
"epoch": 2.507223827517226,
"grad_norm": 2.842649221420288,
"learning_rate": 3.234222580780405e-07,
"logits/chosen": -20.579906463623047,
"logits/rejected": -20.50626564025879,
"logps/chosen": -1.3471759557724,
"logps/rejected": -1.4215319156646729,
"loss": 1.4184,
"odds_ratio_loss": 0.711919367313385,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13471761345863342,
"rewards/margins": 0.007435592822730541,
"rewards/rejected": -0.14215318858623505,
"sft_loss": 1.3471759557724,
"step": 1410
},
{
"epoch": 2.525005556790398,
"grad_norm": 3.6692733764648438,
"learning_rate": 3.0088398414982375e-07,
"logits/chosen": -20.674327850341797,
"logits/rejected": -20.809429168701172,
"logps/chosen": -1.3552839756011963,
"logps/rejected": -1.5087939500808716,
"loss": 1.4268,
"odds_ratio_loss": 0.7148610353469849,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13552840054035187,
"rewards/margins": 0.015351003035902977,
"rewards/rejected": -0.1508793979883194,
"sft_loss": 1.3552839756011963,
"step": 1420
},
{
"epoch": 2.54278728606357,
"grad_norm": 1.8147318363189697,
"learning_rate": 2.7910932038184487e-07,
"logits/chosen": -20.291900634765625,
"logits/rejected": -19.921438217163086,
"logps/chosen": -1.3218873739242554,
"logps/rejected": -1.475524663925171,
"loss": 1.3899,
"odds_ratio_loss": 0.6805239319801331,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13218875229358673,
"rewards/margins": 0.015363717451691628,
"rewards/rejected": -0.1475524604320526,
"sft_loss": 1.3218873739242554,
"step": 1430
},
{
"epoch": 2.5605690153367417,
"grad_norm": 2.6163878440856934,
"learning_rate": 2.5810582770057325e-07,
"logits/chosen": -20.752613067626953,
"logits/rejected": -20.92694854736328,
"logps/chosen": -1.2694684267044067,
"logps/rejected": -1.3633973598480225,
"loss": 1.3412,
"odds_ratio_loss": 0.7170311212539673,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1269468367099762,
"rewards/margins": 0.009392908774316311,
"rewards/rejected": -0.13633975386619568,
"sft_loss": 1.2694684267044067,
"step": 1440
},
{
"epoch": 2.578350744609913,
"grad_norm": 2.4267303943634033,
"learning_rate": 2.3788079925484402e-07,
"logits/chosen": -20.907817840576172,
"logits/rejected": -20.742984771728516,
"logps/chosen": -1.3328653573989868,
"logps/rejected": -1.4286470413208008,
"loss": 1.4041,
"odds_ratio_loss": 0.7125917673110962,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1332865208387375,
"rewards/margins": 0.009578163735568523,
"rewards/rejected": -0.14286470413208008,
"sft_loss": 1.3328653573989868,
"step": 1450
},
{
"epoch": 2.596132473883085,
"grad_norm": 2.563065528869629,
"learning_rate": 2.1844125788342661e-07,
"logits/chosen": -20.36819076538086,
"logits/rejected": -20.245798110961914,
"logps/chosen": -1.3011656999588013,
"logps/rejected": -1.601665735244751,
"loss": 1.3692,
"odds_ratio_loss": 0.679993212223053,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.1301165670156479,
"rewards/margins": 0.030050003901124,
"rewards/rejected": -0.16016657650470734,
"sft_loss": 1.3011656999588013,
"step": 1460
},
{
"epoch": 2.613914203156257,
"grad_norm": 1.592044711112976,
"learning_rate": 1.9979395367644428e-07,
"logits/chosen": -20.988916397094727,
"logits/rejected": -20.9386043548584,
"logps/chosen": -1.2825675010681152,
"logps/rejected": -1.444154977798462,
"loss": 1.3506,
"odds_ratio_loss": 0.6804038286209106,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.12825676798820496,
"rewards/margins": 0.01615874283015728,
"rewards/rejected": -0.1444154977798462,
"sft_loss": 1.2825675010681152,
"step": 1470
},
{
"epoch": 2.631695932429429,
"grad_norm": 3.1699938774108887,
"learning_rate": 1.81945361631512e-07,
"logits/chosen": -21.14181900024414,
"logits/rejected": -21.21465301513672,
"logps/chosen": -1.3464009761810303,
"logps/rejected": -1.4395297765731812,
"loss": 1.4199,
"odds_ratio_loss": 0.7345655560493469,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.13464009761810303,
"rewards/margins": 0.00931286346167326,
"rewards/rejected": -0.14395298063755035,
"sft_loss": 1.3464009761810303,
"step": 1480
},
{
"epoch": 2.6494776617026004,
"grad_norm": 2.9426472187042236,
"learning_rate": 1.6490167940538343e-07,
"logits/chosen": -20.980464935302734,
"logits/rejected": -20.800823211669922,
"logps/chosen": -1.3066675662994385,
"logps/rejected": -1.4621120691299438,
"loss": 1.3758,
"odds_ratio_loss": 0.691235363483429,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13066676259040833,
"rewards/margins": 0.015544441528618336,
"rewards/rejected": -0.1462111920118332,
"sft_loss": 1.3066675662994385,
"step": 1490
},
{
"epoch": 2.6672593909757722,
"grad_norm": 2.0618135929107666,
"learning_rate": 1.4866882516191339e-07,
"logits/chosen": -20.438335418701172,
"logits/rejected": -20.611011505126953,
"logps/chosen": -1.3370510339736938,
"logps/rejected": -1.4072545766830444,
"loss": 1.4105,
"odds_ratio_loss": 0.7341033220291138,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13370510935783386,
"rewards/margins": 0.007020360324531794,
"rewards/rejected": -0.14072546362876892,
"sft_loss": 1.3370510339736938,
"step": 1500
},
{
"epoch": 2.6672593909757722,
"eval_logits/chosen": -20.655466079711914,
"eval_logits/rejected": -20.744272232055664,
"eval_logps/chosen": -1.3319367170333862,
"eval_logps/rejected": -1.5009632110595703,
"eval_loss": 1.3996269702911377,
"eval_odds_ratio_loss": 0.67690110206604,
"eval_rewards/accuracies": 0.5460000038146973,
"eval_rewards/chosen": -0.133193701505661,
"eval_rewards/margins": 0.016902634873986244,
"eval_rewards/rejected": -0.1500963419675827,
"eval_runtime": 80.0771,
"eval_samples_per_second": 12.488,
"eval_sft_loss": 1.3319367170333862,
"eval_steps_per_second": 6.244,
"step": 1500
},
{
"epoch": 2.685041120248944,
"grad_norm": 4.3807573318481445,
"learning_rate": 1.3325243551706057e-07,
"logits/chosen": -20.31595802307129,
"logits/rejected": -20.66552734375,
"logps/chosen": -1.3314330577850342,
"logps/rejected": -1.618486762046814,
"loss": 1.3972,
"odds_ratio_loss": 0.6577640175819397,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.13314330577850342,
"rewards/margins": 0.028705382719635963,
"rewards/rejected": -0.16184869408607483,
"sft_loss": 1.3314330577850342,
"step": 1510
},
{
"epoch": 2.702822849522116,
"grad_norm": 5.399725437164307,
"learning_rate": 1.1865786358165737e-07,
"logits/chosen": -20.374225616455078,
"logits/rejected": -20.766555786132812,
"logps/chosen": -1.3559472560882568,
"logps/rejected": -1.465380311012268,
"loss": 1.4273,
"odds_ratio_loss": 0.71399986743927,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13559472560882568,
"rewards/margins": 0.010943309403955936,
"rewards/rejected": -0.14653804898262024,
"sft_loss": 1.3559472560882568,
"step": 1520
},
{
"epoch": 2.720604578795288,
"grad_norm": 5.161293029785156,
"learning_rate": 1.0489017710262311e-07,
"logits/chosen": -20.828411102294922,
"logits/rejected": -21.007801055908203,
"logps/chosen": -1.3781417608261108,
"logps/rejected": -1.6057850122451782,
"loss": 1.4502,
"odds_ratio_loss": 0.7206953763961792,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.13781419396400452,
"rewards/margins": 0.02276432514190674,
"rewards/rejected": -0.16057850420475006,
"sft_loss": 1.3781417608261108,
"step": 1530
},
{
"epoch": 2.73838630806846,
"grad_norm": 1.9645308256149292,
"learning_rate": 9.195415670326446e-08,
"logits/chosen": -20.687061309814453,
"logits/rejected": -20.75905990600586,
"logps/chosen": -1.3485379219055176,
"logps/rejected": -1.5022733211517334,
"loss": 1.4177,
"odds_ratio_loss": 0.6911865472793579,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1348538100719452,
"rewards/margins": 0.015373537316918373,
"rewards/rejected": -0.150227352976799,
"sft_loss": 1.3485379219055176,
"step": 1540
},
{
"epoch": 2.7561680373416317,
"grad_norm": 2.820127010345459,
"learning_rate": 7.985429422327384e-08,
"logits/chosen": -20.722209930419922,
"logits/rejected": -20.719024658203125,
"logps/chosen": -1.3103783130645752,
"logps/rejected": -1.3722031116485596,
"loss": 1.3834,
"odds_ratio_loss": 0.7300290465354919,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1310378611087799,
"rewards/margins": 0.006182484794408083,
"rewards/rejected": -0.1372203379869461,
"sft_loss": 1.3103783130645752,
"step": 1550
},
{
"epoch": 2.773949766614803,
"grad_norm": 3.8620612621307373,
"learning_rate": 6.859479115900818e-08,
"logits/chosen": -20.64493179321289,
"logits/rejected": -20.700607299804688,
"logps/chosen": -1.4513204097747803,
"logps/rejected": -1.611519455909729,
"loss": 1.5216,
"odds_ratio_loss": 0.7024157047271729,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14513204991817474,
"rewards/margins": 0.01601991057395935,
"rewards/rejected": -0.1611519604921341,
"sft_loss": 1.4513204097747803,
"step": 1560
},
{
"epoch": 2.791731495887975,
"grad_norm": 8.068270683288574,
"learning_rate": 5.817955720457902e-08,
"logits/chosen": -20.495128631591797,
"logits/rejected": -20.559017181396484,
"logps/chosen": -1.27366042137146,
"logps/rejected": -1.3641878366470337,
"loss": 1.3449,
"odds_ratio_loss": 0.7120680809020996,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.12736603617668152,
"rewards/margins": 0.009052738547325134,
"rewards/rejected": -0.13641878962516785,
"sft_loss": 1.27366042137146,
"step": 1570
},
{
"epoch": 2.809513225161147,
"grad_norm": 3.788001775741577,
"learning_rate": 4.861220889427199e-08,
"logits/chosen": -20.778738021850586,
"logits/rejected": -20.58936309814453,
"logps/chosen": -1.3479427099227905,
"logps/rejected": -1.4240316152572632,
"loss": 1.423,
"odds_ratio_loss": 0.7501288652420044,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13479426503181458,
"rewards/margins": 0.007608892861753702,
"rewards/rejected": -0.14240317046642303,
"sft_loss": 1.3479427099227905,
"step": 1580
},
{
"epoch": 2.827294954434319,
"grad_norm": 1.3882092237472534,
"learning_rate": 3.9896068346758074e-08,
"logits/chosen": -20.645978927612305,
"logits/rejected": -20.691020965576172,
"logps/chosen": -1.3436458110809326,
"logps/rejected": -1.4700592756271362,
"loss": 1.4137,
"odds_ratio_loss": 0.7008241415023804,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.13436457514762878,
"rewards/margins": 0.012641333043575287,
"rewards/rejected": -0.14700593054294586,
"sft_loss": 1.3436458110809326,
"step": 1590
},
{
"epoch": 2.8450766837074903,
"grad_norm": 3.9510364532470703,
"learning_rate": 3.203416211153832e-08,
"logits/chosen": -20.51412582397461,
"logits/rejected": -20.81777572631836,
"logps/chosen": -1.331721544265747,
"logps/rejected": -1.4455146789550781,
"loss": 1.4049,
"odds_ratio_loss": 0.7322754859924316,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1331721693277359,
"rewards/margins": 0.011379324831068516,
"rewards/rejected": -0.14455147087574005,
"sft_loss": 1.331721544265747,
"step": 1600
},
{
"epoch": 2.8628584129806622,
"grad_norm": 5.2995758056640625,
"learning_rate": 2.5029220118019393e-08,
"logits/chosen": -20.452526092529297,
"logits/rejected": -20.59510612487793,
"logps/chosen": -1.3901276588439941,
"logps/rejected": -1.4555182456970215,
"loss": 1.4637,
"odds_ratio_loss": 0.7356002330780029,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13901275396347046,
"rewards/margins": 0.006539070047438145,
"rewards/rejected": -0.14555183053016663,
"sft_loss": 1.3901276588439941,
"step": 1610
},
{
"epoch": 2.880640142253834,
"grad_norm": 2.9100406169891357,
"learning_rate": 1.8883674727586122e-08,
"logits/chosen": -20.593090057373047,
"logits/rejected": -20.566762924194336,
"logps/chosen": -1.2591346502304077,
"logps/rejected": -1.5285673141479492,
"loss": 1.3231,
"odds_ratio_loss": 0.6392361521720886,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.12591347098350525,
"rewards/margins": 0.026943260803818703,
"rewards/rejected": -0.1528567224740982,
"sft_loss": 1.2591346502304077,
"step": 1620
},
{
"epoch": 2.898421871527006,
"grad_norm": 1.2503418922424316,
"learning_rate": 1.3599659889000639e-08,
"logits/chosen": -20.94070816040039,
"logits/rejected": -20.831439971923828,
"logps/chosen": -1.3583745956420898,
"logps/rejected": -1.4623037576675415,
"loss": 1.4301,
"odds_ratio_loss": 0.7174537181854248,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13583745062351227,
"rewards/margins": 0.010392926633358002,
"rewards/rejected": -0.14623036980628967,
"sft_loss": 1.3583745956420898,
"step": 1630
},
{
"epoch": 2.916203600800178,
"grad_norm": 1.5849334001541138,
"learning_rate": 9.179010397421528e-09,
"logits/chosen": -20.463802337646484,
"logits/rejected": -20.60258674621582,
"logps/chosen": -1.3220821619033813,
"logps/rejected": -1.4576328992843628,
"loss": 1.3923,
"odds_ratio_loss": 0.7020986676216125,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1322081983089447,
"rewards/margins": 0.01355508528649807,
"rewards/rejected": -0.1457633078098297,
"sft_loss": 1.3220821619033813,
"step": 1640
},
{
"epoch": 2.93398533007335,
"grad_norm": 1.5637987852096558,
"learning_rate": 5.623261257296509e-09,
"logits/chosen": -20.550914764404297,
"logits/rejected": -20.748790740966797,
"logps/chosen": -1.2091234922409058,
"logps/rejected": -1.3757600784301758,
"loss": 1.2759,
"odds_ratio_loss": 0.6680835485458374,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1209123507142067,
"rewards/margins": 0.01666366681456566,
"rewards/rejected": -0.13757601380348206,
"sft_loss": 1.2091234922409058,
"step": 1650
},
{
"epoch": 2.9517670593465217,
"grad_norm": 2.060124158859253,
"learning_rate": 2.933647149357122e-09,
"logits/chosen": -20.7076358795166,
"logits/rejected": -20.74884605407715,
"logps/chosen": -1.344455361366272,
"logps/rejected": -1.4792556762695312,
"loss": 1.415,
"odds_ratio_loss": 0.7057270407676697,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13444553315639496,
"rewards/margins": 0.013480030000209808,
"rewards/rejected": -0.14792557060718536,
"sft_loss": 1.344455361366272,
"step": 1660
},
{
"epoch": 2.969548788619693,
"grad_norm": 1.8397283554077148,
"learning_rate": 1.1111020018930717e-09,
"logits/chosen": -20.895946502685547,
"logits/rejected": -20.76508903503418,
"logps/chosen": -1.319896936416626,
"logps/rejected": -1.4024416208267212,
"loss": 1.3912,
"odds_ratio_loss": 0.7127953767776489,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13198968768119812,
"rewards/margins": 0.008254442363977432,
"rewards/rejected": -0.14024415612220764,
"sft_loss": 1.319896936416626,
"step": 1670
},
{
"epoch": 2.987330517892865,
"grad_norm": 1.2750743627548218,
"learning_rate": 1.5625866646051813e-10,
"logits/chosen": -20.67104721069336,
"logits/rejected": -20.601062774658203,
"logps/chosen": -1.2744053602218628,
"logps/rejected": -1.4778211116790771,
"loss": 1.3388,
"odds_ratio_loss": 0.6436463594436646,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.12744054198265076,
"rewards/margins": 0.02034156210720539,
"rewards/rejected": -0.1477821171283722,
"sft_loss": 1.2744053602218628,
"step": 1680
},
{
"epoch": 2.997999555456768,
"step": 1686,
"total_flos": 5.313908590588723e+17,
"train_loss": 1.477055920117832,
"train_runtime": 8055.1491,
"train_samples_per_second": 3.351,
"train_steps_per_second": 0.209
}
],
"logging_steps": 10,
"max_steps": 1686,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 5.313908590588723e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}