ARMZyany's picture
Upload 12 files
5609a8b verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.21340162185232608,
"eval_steps": 500,
"global_step": 6500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008207754686627926,
"grad_norm": 158.0,
"learning_rate": 1.312910284463895e-07,
"logits/chosen": -4.1174211502075195,
"logits/rejected": -4.145937442779541,
"logps/chosen": -750.2059326171875,
"logps/rejected": -523.7258911132812,
"loss": 0.6934,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.007276252377778292,
"rewards/margins": 0.0009733623010106385,
"rewards/rejected": -0.0082496153190732,
"step": 25
},
{
"epoch": 0.0016415509373255853,
"grad_norm": 150.0,
"learning_rate": 2.680525164113786e-07,
"logits/chosen": -4.106570720672607,
"logits/rejected": -4.1443281173706055,
"logps/chosen": -741.0921630859375,
"logps/rejected": -502.666259765625,
"loss": 0.6987,
"rewards/accuracies": 0.46000000834465027,
"rewards/chosen": -0.002752303844317794,
"rewards/margins": -0.008955330587923527,
"rewards/rejected": 0.006203026045113802,
"step": 50
},
{
"epoch": 0.002462326405988378,
"grad_norm": 237.0,
"learning_rate": 4.0481400437636766e-07,
"logits/chosen": -4.268686771392822,
"logits/rejected": -4.344180107116699,
"logps/chosen": -718.856201171875,
"logps/rejected": -500.9825744628906,
"loss": 0.702,
"rewards/accuracies": 0.47999998927116394,
"rewards/chosen": -0.01361551322042942,
"rewards/margins": -0.016321375966072083,
"rewards/rejected": 0.0027058636769652367,
"step": 75
},
{
"epoch": 0.0032831018746511706,
"grad_norm": 216.0,
"learning_rate": 5.415754923413568e-07,
"logits/chosen": -4.252306938171387,
"logits/rejected": -4.244429111480713,
"logps/chosen": -663.5410766601562,
"logps/rejected": -519.08984375,
"loss": 0.6935,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": -0.0002230451937066391,
"rewards/margins": 0.000489498081151396,
"rewards/rejected": -0.0007125435513444245,
"step": 100
},
{
"epoch": 0.004103877343313963,
"grad_norm": 96.5,
"learning_rate": 6.783369803063458e-07,
"logits/chosen": -4.129316806793213,
"logits/rejected": -4.208637714385986,
"logps/chosen": -719.7318725585938,
"logps/rejected": -521.5628662109375,
"loss": 0.6934,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.002255064435303211,
"rewards/margins": 0.0010580136440694332,
"rewards/rejected": 0.0011970511404797435,
"step": 125
},
{
"epoch": 0.004924652811976756,
"grad_norm": 161.0,
"learning_rate": 8.150984682713349e-07,
"logits/chosen": -4.199742794036865,
"logits/rejected": -4.159580707550049,
"logps/chosen": -695.33984375,
"logps/rejected": -532.4611206054688,
"loss": 0.7009,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.006176384165883064,
"rewards/margins": -0.014046883210539818,
"rewards/rejected": 0.007870499044656754,
"step": 150
},
{
"epoch": 0.005745428280639548,
"grad_norm": 201.0,
"learning_rate": 9.518599562363239e-07,
"logits/chosen": -4.214637279510498,
"logits/rejected": -4.182404041290283,
"logps/chosen": -777.0204467773438,
"logps/rejected": -569.65771484375,
"loss": 0.6915,
"rewards/accuracies": 0.47999998927116394,
"rewards/chosen": -0.0014341524802148342,
"rewards/margins": 0.004235363565385342,
"rewards/rejected": -0.0056695155799388885,
"step": 175
},
{
"epoch": 0.006566203749302341,
"grad_norm": 270.0,
"learning_rate": 1.088621444201313e-06,
"logits/chosen": -4.112586498260498,
"logits/rejected": -4.159411907196045,
"logps/chosen": -700.7896728515625,
"logps/rejected": -617.4669799804688,
"loss": 0.703,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.013239642605185509,
"rewards/margins": -0.01829313486814499,
"rewards/rejected": 0.005053492728620768,
"step": 200
},
{
"epoch": 0.007386979217965133,
"grad_norm": 320.0,
"learning_rate": 1.225382932166302e-06,
"logits/chosen": -4.012618541717529,
"logits/rejected": -4.056581020355225,
"logps/chosen": -560.5947875976562,
"logps/rejected": -470.4196472167969,
"loss": 0.6863,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.00924967136234045,
"rewards/margins": 0.01506039034575224,
"rewards/rejected": -0.005810718517750502,
"step": 225
},
{
"epoch": 0.008207754686627926,
"grad_norm": 80.5,
"learning_rate": 1.3621444201312912e-06,
"logits/chosen": -3.9422268867492676,
"logits/rejected": -4.169854640960693,
"logps/chosen": -868.24365234375,
"logps/rejected": -700.423583984375,
"loss": 0.6884,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.0005125462776049972,
"rewards/margins": 0.011028454639017582,
"rewards/rejected": -0.010515906848013401,
"step": 250
},
{
"epoch": 0.00902853015529072,
"grad_norm": 226.0,
"learning_rate": 1.4989059080962803e-06,
"logits/chosen": -4.265016078948975,
"logits/rejected": -4.272618770599365,
"logps/chosen": -698.0468139648438,
"logps/rejected": -537.390380859375,
"loss": 0.6912,
"rewards/accuracies": 0.4399999976158142,
"rewards/chosen": 0.005534702911973,
"rewards/margins": 0.0049882736057043076,
"rewards/rejected": 0.0005464285495691001,
"step": 275
},
{
"epoch": 0.009849305623953511,
"grad_norm": 258.0,
"learning_rate": 1.6356673960612692e-06,
"logits/chosen": -4.159086227416992,
"logits/rejected": -4.134156703948975,
"logps/chosen": -703.7493896484375,
"logps/rejected": -520.1746826171875,
"loss": 0.7016,
"rewards/accuracies": 0.3799999952316284,
"rewards/chosen": -0.00535870436578989,
"rewards/margins": -0.015425672754645348,
"rewards/rejected": 0.010066968388855457,
"step": 300
},
{
"epoch": 0.010670081092616303,
"grad_norm": 147.0,
"learning_rate": 1.7724288840262582e-06,
"logits/chosen": -4.203159809112549,
"logits/rejected": -4.189880847930908,
"logps/chosen": -780.8976440429688,
"logps/rejected": -511.4765625,
"loss": 0.6884,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.016911819577217102,
"rewards/margins": 0.010711951181292534,
"rewards/rejected": 0.006199866533279419,
"step": 325
},
{
"epoch": 0.011490856561279097,
"grad_norm": 223.0,
"learning_rate": 1.9091903719912473e-06,
"logits/chosen": -4.209036350250244,
"logits/rejected": -4.1973419189453125,
"logps/chosen": -747.6294555664062,
"logps/rejected": -575.0943603515625,
"loss": 0.6953,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.006754291243851185,
"rewards/margins": -0.0026557582896202803,
"rewards/rejected": 0.009410049766302109,
"step": 350
},
{
"epoch": 0.012311632029941889,
"grad_norm": 207.0,
"learning_rate": 2.0459518599562366e-06,
"logits/chosen": -4.2033867835998535,
"logits/rejected": -4.380429267883301,
"logps/chosen": -750.8778076171875,
"logps/rejected": -514.9067993164062,
"loss": 0.6839,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.02162899449467659,
"rewards/margins": 0.019526075571775436,
"rewards/rejected": 0.0021029210183769464,
"step": 375
},
{
"epoch": 0.013132407498604682,
"grad_norm": 160.0,
"learning_rate": 2.1827133479212255e-06,
"logits/chosen": -4.284715175628662,
"logits/rejected": -4.265630722045898,
"logps/chosen": -698.50439453125,
"logps/rejected": -537.8710327148438,
"loss": 0.6825,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.01697949506342411,
"rewards/margins": 0.022279297932982445,
"rewards/rejected": -0.005299802869558334,
"step": 400
},
{
"epoch": 0.013953182967267474,
"grad_norm": 171.0,
"learning_rate": 2.3194748358862144e-06,
"logits/chosen": -4.165258407592773,
"logits/rejected": -4.168488025665283,
"logps/chosen": -791.05419921875,
"logps/rejected": -458.402587890625,
"loss": 0.6981,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0009704286349005997,
"rewards/margins": -0.008310976438224316,
"rewards/rejected": 0.009281404316425323,
"step": 425
},
{
"epoch": 0.014773958435930266,
"grad_norm": 170.0,
"learning_rate": 2.4562363238512038e-06,
"logits/chosen": -4.120928764343262,
"logits/rejected": -4.241576671600342,
"logps/chosen": -773.9873657226562,
"logps/rejected": -557.421142578125,
"loss": 0.6817,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.027397265657782555,
"rewards/margins": 0.024752546101808548,
"rewards/rejected": 0.0026447183918207884,
"step": 450
},
{
"epoch": 0.01559473390459306,
"grad_norm": 248.0,
"learning_rate": 2.592997811816193e-06,
"logits/chosen": -3.9233641624450684,
"logits/rejected": -4.040389060974121,
"logps/chosen": -659.5145874023438,
"logps/rejected": -562.8936767578125,
"loss": 0.6946,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.00023105592117644846,
"rewards/margins": -0.0014622471062466502,
"rewards/rejected": 0.0012311902828514576,
"step": 475
},
{
"epoch": 0.01641550937325585,
"grad_norm": 218.0,
"learning_rate": 2.7297592997811816e-06,
"logits/chosen": -4.040191173553467,
"logits/rejected": -4.220387935638428,
"logps/chosen": -694.8212280273438,
"logps/rejected": -592.8945922851562,
"loss": 0.6992,
"rewards/accuracies": 0.4399999976158142,
"rewards/chosen": 0.015942150726914406,
"rewards/margins": -0.010712460614740849,
"rewards/rejected": 0.02665461041033268,
"step": 500
},
{
"epoch": 0.017236284841918643,
"grad_norm": 111.5,
"learning_rate": 2.866520787746171e-06,
"logits/chosen": -3.9839234352111816,
"logits/rejected": -4.0927205085754395,
"logps/chosen": -850.669921875,
"logps/rejected": -657.936279296875,
"loss": 0.6903,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.016984395682811737,
"rewards/margins": 0.0072107817977666855,
"rewards/rejected": 0.009773612953722477,
"step": 525
},
{
"epoch": 0.01805706031058144,
"grad_norm": 308.0,
"learning_rate": 3.0032822757111603e-06,
"logits/chosen": -4.234395503997803,
"logits/rejected": -4.200852394104004,
"logps/chosen": -867.9359130859375,
"logps/rejected": -640.5671997070312,
"loss": 0.6912,
"rewards/accuracies": 0.47999998927116394,
"rewards/chosen": 0.017692890018224716,
"rewards/margins": 0.005810171365737915,
"rewards/rejected": 0.011882718652486801,
"step": 550
},
{
"epoch": 0.01887783577924423,
"grad_norm": 193.0,
"learning_rate": 3.1400437636761488e-06,
"logits/chosen": -4.274899482727051,
"logits/rejected": -4.361607551574707,
"logps/chosen": -772.3294677734375,
"logps/rejected": -483.2698974609375,
"loss": 0.6799,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.027177538722753525,
"rewards/margins": 0.028825119137763977,
"rewards/rejected": -0.001647579250857234,
"step": 575
},
{
"epoch": 0.019698611247907023,
"grad_norm": 166.0,
"learning_rate": 3.276805251641138e-06,
"logits/chosen": -4.136162281036377,
"logits/rejected": -4.200733661651611,
"logps/chosen": -719.72998046875,
"logps/rejected": -528.1305541992188,
"loss": 0.6876,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.019943855702877045,
"rewards/margins": 0.012945435009896755,
"rewards/rejected": 0.006998421624302864,
"step": 600
},
{
"epoch": 0.020519386716569814,
"grad_norm": 160.0,
"learning_rate": 3.4135667396061274e-06,
"logits/chosen": -4.017778396606445,
"logits/rejected": -4.027612209320068,
"logps/chosen": -691.2648315429688,
"logps/rejected": -643.5048217773438,
"loss": 0.6981,
"rewards/accuracies": 0.41999998688697815,
"rewards/chosen": -0.003029178362339735,
"rewards/margins": -0.008131473325192928,
"rewards/rejected": 0.005102294497191906,
"step": 625
},
{
"epoch": 0.021340162185232606,
"grad_norm": 288.0,
"learning_rate": 3.5503282275711163e-06,
"logits/chosen": -4.192790508270264,
"logits/rejected": -4.225100517272949,
"logps/chosen": -628.4151000976562,
"logps/rejected": -418.1283874511719,
"loss": 0.6874,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.031290203332901,
"rewards/margins": 0.013368282467126846,
"rewards/rejected": 0.017921922728419304,
"step": 650
},
{
"epoch": 0.0221609376538954,
"grad_norm": 103.0,
"learning_rate": 3.6870897155361052e-06,
"logits/chosen": -4.11993408203125,
"logits/rejected": -4.221017360687256,
"logps/chosen": -701.9328002929688,
"logps/rejected": -584.27294921875,
"loss": 0.6912,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.018245697021484375,
"rewards/margins": 0.0061894748359918594,
"rewards/rejected": 0.012056220322847366,
"step": 675
},
{
"epoch": 0.022981713122558194,
"grad_norm": 238.0,
"learning_rate": 3.823851203501095e-06,
"logits/chosen": -4.17140007019043,
"logits/rejected": -4.289581775665283,
"logps/chosen": -705.1632690429688,
"logps/rejected": -457.5531005859375,
"loss": 0.6833,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.03122738189995289,
"rewards/margins": 0.02252998761832714,
"rewards/rejected": 0.008697391487658024,
"step": 700
},
{
"epoch": 0.023802488591220985,
"grad_norm": 215.0,
"learning_rate": 3.9606126914660835e-06,
"logits/chosen": -4.087902069091797,
"logits/rejected": -4.143421173095703,
"logps/chosen": -790.4818725585938,
"logps/rejected": -638.943359375,
"loss": 0.6754,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.04056026414036751,
"rewards/margins": 0.03895123302936554,
"rewards/rejected": 0.0016090321587398648,
"step": 725
},
{
"epoch": 0.024623264059883777,
"grad_norm": 236.0,
"learning_rate": 4.097374179431072e-06,
"logits/chosen": -4.093388557434082,
"logits/rejected": -4.1176371574401855,
"logps/chosen": -811.650634765625,
"logps/rejected": -576.680908203125,
"loss": 0.6658,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.05430950969457626,
"rewards/margins": 0.05836718529462814,
"rewards/rejected": -0.004057674203068018,
"step": 750
},
{
"epoch": 0.02544403952854657,
"grad_norm": 270.0,
"learning_rate": 4.234135667396061e-06,
"logits/chosen": -4.053430557250977,
"logits/rejected": -4.1902666091918945,
"logps/chosen": -639.8978881835938,
"logps/rejected": -590.718505859375,
"loss": 0.6847,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.026400210335850716,
"rewards/margins": 0.01981959119439125,
"rewards/rejected": 0.006580619607120752,
"step": 775
},
{
"epoch": 0.026264814997209365,
"grad_norm": 183.0,
"learning_rate": 4.370897155361051e-06,
"logits/chosen": -4.134805202484131,
"logits/rejected": -4.280844211578369,
"logps/chosen": -688.987060546875,
"logps/rejected": -571.3134155273438,
"loss": 0.6725,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.051876142621040344,
"rewards/margins": 0.044371940195560455,
"rewards/rejected": 0.0075042033568024635,
"step": 800
},
{
"epoch": 0.027085590465872156,
"grad_norm": 158.0,
"learning_rate": 4.50765864332604e-06,
"logits/chosen": -4.2159342765808105,
"logits/rejected": -4.336707592010498,
"logps/chosen": -794.3070068359375,
"logps/rejected": -574.3153686523438,
"loss": 0.6961,
"rewards/accuracies": 0.46000000834465027,
"rewards/chosen": 0.014993507415056229,
"rewards/margins": -0.0014925742289051414,
"rewards/rejected": 0.016486085951328278,
"step": 825
},
{
"epoch": 0.02790636593453495,
"grad_norm": 219.0,
"learning_rate": 4.644420131291029e-06,
"logits/chosen": -4.15441370010376,
"logits/rejected": -4.211958408355713,
"logps/chosen": -692.42529296875,
"logps/rejected": -581.7979125976562,
"loss": 0.6822,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.04421781376004219,
"rewards/margins": 0.02522583305835724,
"rewards/rejected": 0.0189919825643301,
"step": 850
},
{
"epoch": 0.02872714140319774,
"grad_norm": 175.0,
"learning_rate": 4.781181619256018e-06,
"logits/chosen": -4.147443771362305,
"logits/rejected": -4.1150126457214355,
"logps/chosen": -791.9564819335938,
"logps/rejected": -645.9444580078125,
"loss": 0.6655,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.06266529858112335,
"rewards/margins": 0.06153992563486099,
"rewards/rejected": 0.0011253589764237404,
"step": 875
},
{
"epoch": 0.029547916871860532,
"grad_norm": 171.0,
"learning_rate": 4.917943107221007e-06,
"logits/chosen": -4.13357400894165,
"logits/rejected": -4.254169464111328,
"logps/chosen": -709.9840698242188,
"logps/rejected": -507.0233459472656,
"loss": 0.6634,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.046279050409793854,
"rewards/margins": 0.07115618884563446,
"rewards/rejected": -0.024877142161130905,
"step": 900
},
{
"epoch": 0.030368692340523327,
"grad_norm": 207.0,
"learning_rate": 4.99830766627179e-06,
"logits/chosen": -4.161207675933838,
"logits/rejected": -4.27424430847168,
"logps/chosen": -639.82373046875,
"logps/rejected": -429.6187744140625,
"loss": 0.6713,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.05205925926566124,
"rewards/margins": 0.0562448687851429,
"rewards/rejected": -0.00418561277911067,
"step": 925
},
{
"epoch": 0.03118946780918612,
"grad_norm": 85.0,
"learning_rate": 4.994076831951261e-06,
"logits/chosen": -4.219003677368164,
"logits/rejected": -4.230233669281006,
"logps/chosen": -647.91455078125,
"logps/rejected": -490.3548583984375,
"loss": 0.6668,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.06009820103645325,
"rewards/margins": 0.06201506033539772,
"rewards/rejected": -0.001916866865940392,
"step": 950
},
{
"epoch": 0.03201024327784891,
"grad_norm": 196.0,
"learning_rate": 4.989845997630734e-06,
"logits/chosen": -4.0881028175354,
"logits/rejected": -4.141018867492676,
"logps/chosen": -924.4205322265625,
"logps/rejected": -624.48046875,
"loss": 0.6708,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.058801230043172836,
"rewards/margins": 0.055282555520534515,
"rewards/rejected": 0.0035186754539608955,
"step": 975
},
{
"epoch": 0.0328310187465117,
"grad_norm": 88.5,
"learning_rate": 4.985615163310205e-06,
"logits/chosen": -4.2076311111450195,
"logits/rejected": -4.243706226348877,
"logps/chosen": -681.6548461914062,
"logps/rejected": -456.4216003417969,
"loss": 0.659,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.07433926314115524,
"rewards/margins": 0.08106620609760284,
"rewards/rejected": -0.006726943422108889,
"step": 1000
},
{
"epoch": 0.033651794215174495,
"grad_norm": 242.0,
"learning_rate": 4.981384328989678e-06,
"logits/chosen": -4.125373363494873,
"logits/rejected": -4.25338888168335,
"logps/chosen": -866.8526000976562,
"logps/rejected": -517.0289916992188,
"loss": 0.6917,
"rewards/accuracies": 0.47999998927116394,
"rewards/chosen": -0.002582031534984708,
"rewards/margins": 0.004485914018005133,
"rewards/rejected": -0.00706794299185276,
"step": 1025
},
{
"epoch": 0.03447256968383729,
"grad_norm": 260.0,
"learning_rate": 4.977153494669149e-06,
"logits/chosen": -4.289287090301514,
"logits/rejected": -4.317573070526123,
"logps/chosen": -766.9920043945312,
"logps/rejected": -571.254638671875,
"loss": 0.6883,
"rewards/accuracies": 0.4399999976158142,
"rewards/chosen": 0.02896983176469803,
"rewards/margins": 0.012178352102637291,
"rewards/rejected": 0.01679147779941559,
"step": 1050
},
{
"epoch": 0.03529334515250008,
"grad_norm": 270.0,
"learning_rate": 4.972922660348622e-06,
"logits/chosen": -4.071505069732666,
"logits/rejected": -4.080994129180908,
"logps/chosen": -811.8478393554688,
"logps/rejected": -614.4443359375,
"loss": 0.6888,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.028604382649064064,
"rewards/margins": 0.011533576995134354,
"rewards/rejected": 0.01707080751657486,
"step": 1075
},
{
"epoch": 0.03611412062116288,
"grad_norm": 294.0,
"learning_rate": 4.968691826028093e-06,
"logits/chosen": -4.189393520355225,
"logits/rejected": -4.166673183441162,
"logps/chosen": -789.1497802734375,
"logps/rejected": -534.8926391601562,
"loss": 0.6808,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.04065079241991043,
"rewards/margins": 0.027193991467356682,
"rewards/rejected": 0.0134567990899086,
"step": 1100
},
{
"epoch": 0.03693489608982567,
"grad_norm": 168.0,
"learning_rate": 4.964460991707566e-06,
"logits/chosen": -4.194692611694336,
"logits/rejected": -4.184408664703369,
"logps/chosen": -790.99072265625,
"logps/rejected": -579.170166015625,
"loss": 0.688,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.025631260126829147,
"rewards/margins": 0.012109901756048203,
"rewards/rejected": 0.01352135930210352,
"step": 1125
},
{
"epoch": 0.03775567155848846,
"grad_norm": 332.0,
"learning_rate": 4.960230157387037e-06,
"logits/chosen": -4.162527561187744,
"logits/rejected": -4.196642875671387,
"logps/chosen": -745.1271362304688,
"logps/rejected": -615.0593872070312,
"loss": 0.7154,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.05613722652196884,
"rewards/margins": -0.025486024096608162,
"rewards/rejected": 0.08162324875593185,
"step": 1150
},
{
"epoch": 0.03857644702715125,
"grad_norm": 284.0,
"learning_rate": 4.95599932306651e-06,
"logits/chosen": -4.014937877655029,
"logits/rejected": -4.0320024490356445,
"logps/chosen": -623.9722900390625,
"logps/rejected": -527.3934936523438,
"loss": 0.714,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.052386511117219925,
"rewards/margins": -0.019469745457172394,
"rewards/rejected": 0.07185625284910202,
"step": 1175
},
{
"epoch": 0.039397222495814045,
"grad_norm": 188.0,
"learning_rate": 4.951768488745981e-06,
"logits/chosen": -4.1850738525390625,
"logits/rejected": -4.26005220413208,
"logps/chosen": -761.0906372070312,
"logps/rejected": -467.2315979003906,
"loss": 0.6765,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.07359949499368668,
"rewards/margins": 0.03598727285861969,
"rewards/rejected": 0.03761221468448639,
"step": 1200
},
{
"epoch": 0.04021799796447684,
"grad_norm": 262.0,
"learning_rate": 4.947537654425454e-06,
"logits/chosen": -4.217501163482666,
"logits/rejected": -4.082854747772217,
"logps/chosen": -671.0990600585938,
"logps/rejected": -584.4004516601562,
"loss": 0.6815,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.07619458436965942,
"rewards/margins": 0.028806446120142937,
"rewards/rejected": 0.047388140112161636,
"step": 1225
},
{
"epoch": 0.04103877343313963,
"grad_norm": 114.5,
"learning_rate": 4.943306820104925e-06,
"logits/chosen": -4.066970348358154,
"logits/rejected": -4.118155479431152,
"logps/chosen": -707.7814331054688,
"logps/rejected": -522.02880859375,
"loss": 0.6886,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.03777886554598808,
"rewards/margins": 0.011078822426497936,
"rewards/rejected": 0.02670004405081272,
"step": 1250
},
{
"epoch": 0.04185954890180242,
"grad_norm": 97.0,
"learning_rate": 4.939075985784398e-06,
"logits/chosen": -4.040958881378174,
"logits/rejected": -4.270505905151367,
"logps/chosen": -622.3072509765625,
"logps/rejected": -454.3692932128906,
"loss": 0.6937,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.05720474198460579,
"rewards/margins": 0.0045347679406404495,
"rewards/rejected": 0.05266997963190079,
"step": 1275
},
{
"epoch": 0.04268032437046521,
"grad_norm": 159.0,
"learning_rate": 4.934845151463869e-06,
"logits/chosen": -4.0371527671813965,
"logits/rejected": -4.051915168762207,
"logps/chosen": -697.654296875,
"logps/rejected": -547.8112182617188,
"loss": 0.6924,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.06825733929872513,
"rewards/margins": 0.005310583859682083,
"rewards/rejected": 0.06294675171375275,
"step": 1300
},
{
"epoch": 0.04350109983912801,
"grad_norm": 117.5,
"learning_rate": 4.930614317143341e-06,
"logits/chosen": -4.278604507446289,
"logits/rejected": -4.323906421661377,
"logps/chosen": -667.7477416992188,
"logps/rejected": -371.2331237792969,
"loss": 0.6782,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.0644068494439125,
"rewards/margins": 0.03309326991438866,
"rewards/rejected": 0.03131357580423355,
"step": 1325
},
{
"epoch": 0.0443218753077908,
"grad_norm": 173.0,
"learning_rate": 4.926383482822813e-06,
"logits/chosen": -4.178940296173096,
"logits/rejected": -4.232710361480713,
"logps/chosen": -695.1062622070312,
"logps/rejected": -554.77734375,
"loss": 0.6671,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0913730263710022,
"rewards/margins": 0.05747390165925026,
"rewards/rejected": 0.03389911353588104,
"step": 1350
},
{
"epoch": 0.045142650776453595,
"grad_norm": 158.0,
"learning_rate": 4.922152648502285e-06,
"logits/chosen": -4.029209136962891,
"logits/rejected": -4.280393123626709,
"logps/chosen": -510.9493103027344,
"logps/rejected": -371.9337463378906,
"loss": 0.6774,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.05904337763786316,
"rewards/margins": 0.03466322645545006,
"rewards/rejected": 0.024380149319767952,
"step": 1375
},
{
"epoch": 0.04596342624511639,
"grad_norm": 173.0,
"learning_rate": 4.917921814181757e-06,
"logits/chosen": -4.22706413269043,
"logits/rejected": -4.331350326538086,
"logps/chosen": -816.2857666015625,
"logps/rejected": -513.9669189453125,
"loss": 0.6742,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.06091469153761864,
"rewards/margins": 0.0425628200173378,
"rewards/rejected": 0.018351875245571136,
"step": 1400
},
{
"epoch": 0.04678420171377918,
"grad_norm": 101.0,
"learning_rate": 4.913690979861229e-06,
"logits/chosen": -4.167561054229736,
"logits/rejected": -4.328184127807617,
"logps/chosen": -795.4500122070312,
"logps/rejected": -513.28515625,
"loss": 0.6806,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.05300300940871239,
"rewards/margins": 0.029740547761321068,
"rewards/rejected": 0.023262467235326767,
"step": 1425
},
{
"epoch": 0.04760497718244197,
"grad_norm": 228.0,
"learning_rate": 4.909460145540701e-06,
"logits/chosen": -4.266139507293701,
"logits/rejected": -4.2949018478393555,
"logps/chosen": -818.7105712890625,
"logps/rejected": -544.2308349609375,
"loss": 0.6714,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.06837845593690872,
"rewards/margins": 0.04987398162484169,
"rewards/rejected": 0.01850447617471218,
"step": 1450
},
{
"epoch": 0.04842575265110476,
"grad_norm": 171.0,
"learning_rate": 4.905229311220173e-06,
"logits/chosen": -4.202406406402588,
"logits/rejected": -4.179924964904785,
"logps/chosen": -740.6281127929688,
"logps/rejected": -558.76416015625,
"loss": 0.6772,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.08042413741350174,
"rewards/margins": 0.03836137428879738,
"rewards/rejected": 0.042062774300575256,
"step": 1475
},
{
"epoch": 0.049246528119767555,
"grad_norm": 158.0,
"learning_rate": 4.900998476899645e-06,
"logits/chosen": -4.150285243988037,
"logits/rejected": -4.043075084686279,
"logps/chosen": -907.114501953125,
"logps/rejected": -761.865234375,
"loss": 0.6657,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.08876218646764755,
"rewards/margins": 0.06508920341730118,
"rewards/rejected": 0.02367297373712063,
"step": 1500
},
{
"epoch": 0.05006730358843035,
"grad_norm": 296.0,
"learning_rate": 4.896767642579117e-06,
"logits/chosen": -4.109315395355225,
"logits/rejected": -4.170251846313477,
"logps/chosen": -897.26953125,
"logps/rejected": -622.68310546875,
"loss": 0.6654,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.09235405921936035,
"rewards/margins": 0.062233816832304,
"rewards/rejected": 0.0301202479749918,
"step": 1525
},
{
"epoch": 0.05088807905709314,
"grad_norm": 205.0,
"learning_rate": 4.892536808258589e-06,
"logits/chosen": -4.138393402099609,
"logits/rejected": -4.094308376312256,
"logps/chosen": -725.5248413085938,
"logps/rejected": -577.0723266601562,
"loss": 0.6771,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.07355516403913498,
"rewards/margins": 0.03712104633450508,
"rewards/rejected": 0.0364341177046299,
"step": 1550
},
{
"epoch": 0.05170885452575594,
"grad_norm": 324.0,
"learning_rate": 4.888305973938061e-06,
"logits/chosen": -4.188644886016846,
"logits/rejected": -4.159783840179443,
"logps/chosen": -896.7723388671875,
"logps/rejected": -618.6541748046875,
"loss": 0.6671,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.09490902721881866,
"rewards/margins": 0.060124482959508896,
"rewards/rejected": 0.03478454798460007,
"step": 1575
},
{
"epoch": 0.05252962999441873,
"grad_norm": 150.0,
"learning_rate": 4.884075139617533e-06,
"logits/chosen": -4.2158002853393555,
"logits/rejected": -4.447061061859131,
"logps/chosen": -633.4708862304688,
"logps/rejected": -464.6991271972656,
"loss": 0.6719,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.08528276532888412,
"rewards/margins": 0.04901896044611931,
"rewards/rejected": 0.036263808608055115,
"step": 1600
},
{
"epoch": 0.05335040546308152,
"grad_norm": 288.0,
"learning_rate": 4.879844305297005e-06,
"logits/chosen": -4.165538311004639,
"logits/rejected": -4.247755527496338,
"logps/chosen": -781.2611694335938,
"logps/rejected": -623.4193115234375,
"loss": 0.6521,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1412876844406128,
"rewards/margins": 0.09663904458284378,
"rewards/rejected": 0.044648658484220505,
"step": 1625
},
{
"epoch": 0.05417118093174431,
"grad_norm": 152.0,
"learning_rate": 4.875613470976477e-06,
"logits/chosen": -4.050336837768555,
"logits/rejected": -4.09099817276001,
"logps/chosen": -526.436767578125,
"logps/rejected": -486.9073181152344,
"loss": 0.6864,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.07214751839637756,
"rewards/margins": 0.0187962856143713,
"rewards/rejected": 0.05335123464465141,
"step": 1650
},
{
"epoch": 0.054991956400407105,
"grad_norm": 340.0,
"learning_rate": 4.871382636655949e-06,
"logits/chosen": -4.108943462371826,
"logits/rejected": -4.278311252593994,
"logps/chosen": -790.7910766601562,
"logps/rejected": -563.6265869140625,
"loss": 0.6585,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.08027210086584091,
"rewards/margins": 0.07762635499238968,
"rewards/rejected": 0.002645747736096382,
"step": 1675
},
{
"epoch": 0.0558127318690699,
"grad_norm": 185.0,
"learning_rate": 4.867151802335421e-06,
"logits/chosen": -4.215210437774658,
"logits/rejected": -4.298604965209961,
"logps/chosen": -722.01025390625,
"logps/rejected": -528.048583984375,
"loss": 0.6605,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.11607293039560318,
"rewards/margins": 0.07552981376647949,
"rewards/rejected": 0.04054312780499458,
"step": 1700
},
{
"epoch": 0.05663350733773269,
"grad_norm": 278.0,
"learning_rate": 4.862920968014893e-06,
"logits/chosen": -4.231524467468262,
"logits/rejected": -4.164397716522217,
"logps/chosen": -781.760009765625,
"logps/rejected": -626.20751953125,
"loss": 0.6602,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.11867771297693253,
"rewards/margins": 0.07886005192995071,
"rewards/rejected": 0.03981764614582062,
"step": 1725
},
{
"epoch": 0.05745428280639548,
"grad_norm": 198.0,
"learning_rate": 4.858690133694365e-06,
"logits/chosen": -4.20419454574585,
"logits/rejected": -4.252129554748535,
"logps/chosen": -770.19873046875,
"logps/rejected": -581.0945434570312,
"loss": 0.6478,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.09692225605249405,
"rewards/margins": 0.10454078763723373,
"rewards/rejected": -0.0076185232028365135,
"step": 1750
},
{
"epoch": 0.05827505827505827,
"grad_norm": 252.0,
"learning_rate": 4.854459299373837e-06,
"logits/chosen": -4.024634838104248,
"logits/rejected": -4.1678338050842285,
"logps/chosen": -713.2058715820312,
"logps/rejected": -553.06591796875,
"loss": 0.6573,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.11371553689241409,
"rewards/margins": 0.08740261197090149,
"rewards/rejected": 0.026312928646802902,
"step": 1775
},
{
"epoch": 0.059095833743721064,
"grad_norm": 197.0,
"learning_rate": 4.850228465053309e-06,
"logits/chosen": -4.1599507331848145,
"logits/rejected": -4.157556056976318,
"logps/chosen": -475.4374084472656,
"logps/rejected": -405.504150390625,
"loss": 0.6723,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.034932468086481094,
"rewards/margins": 0.04883403331041336,
"rewards/rejected": -0.01390156988054514,
"step": 1800
},
{
"epoch": 0.05991660921238386,
"grad_norm": 218.0,
"learning_rate": 4.845997630732781e-06,
"logits/chosen": -4.242368698120117,
"logits/rejected": -4.189089775085449,
"logps/chosen": -717.33154296875,
"logps/rejected": -426.8397216796875,
"loss": 0.6618,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.11887142807245255,
"rewards/margins": 0.07003708183765411,
"rewards/rejected": 0.048834361135959625,
"step": 1825
},
{
"epoch": 0.060737384681046655,
"grad_norm": 165.0,
"learning_rate": 4.841766796412253e-06,
"logits/chosen": -4.1461181640625,
"logits/rejected": -4.220346450805664,
"logps/chosen": -646.8983764648438,
"logps/rejected": -572.834228515625,
"loss": 0.6754,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.08920831978321075,
"rewards/margins": 0.04618198052048683,
"rewards/rejected": 0.04302635043859482,
"step": 1850
},
{
"epoch": 0.06155816014970945,
"grad_norm": 320.0,
"learning_rate": 4.837535962091725e-06,
"logits/chosen": -4.157592296600342,
"logits/rejected": -4.301741123199463,
"logps/chosen": -965.8426513671875,
"logps/rejected": -738.236083984375,
"loss": 0.651,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.14710073173046112,
"rewards/margins": 0.09817076474428177,
"rewards/rejected": 0.048929959535598755,
"step": 1875
},
{
"epoch": 0.06237893561837224,
"grad_norm": 186.0,
"learning_rate": 4.833305127771197e-06,
"logits/chosen": -4.285674571990967,
"logits/rejected": -4.307552337646484,
"logps/chosen": -676.7159423828125,
"logps/rejected": -584.8186645507812,
"loss": 0.6319,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.08745526522397995,
"rewards/margins": 0.14058372378349304,
"rewards/rejected": -0.05312845855951309,
"step": 1900
},
{
"epoch": 0.06319971108703504,
"grad_norm": 217.0,
"learning_rate": 4.829074293450669e-06,
"logits/chosen": -4.085958480834961,
"logits/rejected": -4.113046169281006,
"logps/chosen": -686.4339599609375,
"logps/rejected": -575.382568359375,
"loss": 0.6642,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.06624173372983932,
"rewards/margins": 0.06577203422784805,
"rewards/rejected": 0.0004696959222201258,
"step": 1925
},
{
"epoch": 0.06402048655569782,
"grad_norm": 175.0,
"learning_rate": 4.824843459130141e-06,
"logits/chosen": -4.064718723297119,
"logits/rejected": -4.079798698425293,
"logps/chosen": -667.0342407226562,
"logps/rejected": -558.95947265625,
"loss": 0.6847,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.07132381200790405,
"rewards/margins": 0.031461507081985474,
"rewards/rejected": 0.03986230492591858,
"step": 1950
},
{
"epoch": 0.06484126202436062,
"grad_norm": 151.0,
"learning_rate": 4.820612624809613e-06,
"logits/chosen": -4.2754926681518555,
"logits/rejected": -4.281916618347168,
"logps/chosen": -924.2509155273438,
"logps/rejected": -606.2994384765625,
"loss": 0.6524,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.0693202093243599,
"rewards/margins": 0.10057316720485687,
"rewards/rejected": -0.03125295788049698,
"step": 1975
},
{
"epoch": 0.0656620374930234,
"grad_norm": 142.0,
"learning_rate": 4.816381790489085e-06,
"logits/chosen": -3.999875545501709,
"logits/rejected": -4.053827285766602,
"logps/chosen": -583.6886596679688,
"logps/rejected": -434.0615234375,
"loss": 0.6805,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.04275403916835785,
"rewards/margins": 0.04212899133563042,
"rewards/rejected": 0.000625052722170949,
"step": 2000
},
{
"epoch": 0.0664828129616862,
"grad_norm": 216.0,
"learning_rate": 4.812150956168557e-06,
"logits/chosen": -4.0387349128723145,
"logits/rejected": -4.308966636657715,
"logps/chosen": -707.042724609375,
"logps/rejected": -579.7852172851562,
"loss": 0.6822,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.024804413318634033,
"rewards/margins": 0.03053770214319229,
"rewards/rejected": -0.005733292084187269,
"step": 2025
},
{
"epoch": 0.06730358843034899,
"grad_norm": 280.0,
"learning_rate": 4.807920121848029e-06,
"logits/chosen": -4.186931133270264,
"logits/rejected": -4.16575813293457,
"logps/chosen": -772.7561645507812,
"logps/rejected": -545.01708984375,
"loss": 0.6805,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.06507274508476257,
"rewards/margins": 0.050922200083732605,
"rewards/rejected": 0.014150548726320267,
"step": 2050
},
{
"epoch": 0.06812436389901179,
"grad_norm": 174.0,
"learning_rate": 4.803689287527501e-06,
"logits/chosen": -4.136441707611084,
"logits/rejected": -4.354907035827637,
"logps/chosen": -664.2943725585938,
"logps/rejected": -509.4131164550781,
"loss": 0.6535,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.0457613430917263,
"rewards/margins": 0.09452405571937561,
"rewards/rejected": -0.048762716352939606,
"step": 2075
},
{
"epoch": 0.06894513936767457,
"grad_norm": 125.0,
"learning_rate": 4.799458453206973e-06,
"logits/chosen": -4.2889180183410645,
"logits/rejected": -4.377254486083984,
"logps/chosen": -819.005615234375,
"logps/rejected": -547.5381469726562,
"loss": 0.6506,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.08488751202821732,
"rewards/margins": 0.10479531437158585,
"rewards/rejected": -0.019907798618078232,
"step": 2100
},
{
"epoch": 0.06976591483633737,
"grad_norm": 138.0,
"learning_rate": 4.795227618886445e-06,
"logits/chosen": -4.016672134399414,
"logits/rejected": -4.22638463973999,
"logps/chosen": -694.584716796875,
"logps/rejected": -532.6060791015625,
"loss": 0.6714,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.025129133835434914,
"rewards/margins": 0.058808211237192154,
"rewards/rejected": -0.03367907553911209,
"step": 2125
},
{
"epoch": 0.07058669030500016,
"grad_norm": 154.0,
"learning_rate": 4.790996784565917e-06,
"logits/chosen": -4.167966365814209,
"logits/rejected": -4.243107795715332,
"logps/chosen": -710.6177978515625,
"logps/rejected": -499.143798828125,
"loss": 0.6524,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.10299518704414368,
"rewards/margins": 0.10052691400051117,
"rewards/rejected": 0.002468266524374485,
"step": 2150
},
{
"epoch": 0.07140746577366296,
"grad_norm": 255.0,
"learning_rate": 4.786765950245389e-06,
"logits/chosen": -4.177813529968262,
"logits/rejected": -4.1956963539123535,
"logps/chosen": -716.7862548828125,
"logps/rejected": -503.1024475097656,
"loss": 0.6542,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0937609001994133,
"rewards/margins": 0.0912182629108429,
"rewards/rejected": 0.0025426370557397604,
"step": 2175
},
{
"epoch": 0.07222824124232576,
"grad_norm": 221.0,
"learning_rate": 4.782535115924861e-06,
"logits/chosen": -4.274630546569824,
"logits/rejected": -4.261683940887451,
"logps/chosen": -825.6651000976562,
"logps/rejected": -622.1373291015625,
"loss": 0.6467,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.13351120054721832,
"rewards/margins": 0.11293386667966843,
"rewards/rejected": 0.020577318966388702,
"step": 2200
},
{
"epoch": 0.07304901671098854,
"grad_norm": 153.0,
"learning_rate": 4.778304281604333e-06,
"logits/chosen": -4.295671463012695,
"logits/rejected": -4.270346164703369,
"logps/chosen": -703.2100219726562,
"logps/rejected": -497.50555419921875,
"loss": 0.6664,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.06314626336097717,
"rewards/margins": 0.06403078138828278,
"rewards/rejected": -0.0008845186093822122,
"step": 2225
},
{
"epoch": 0.07386979217965134,
"grad_norm": 148.0,
"learning_rate": 4.774073447283805e-06,
"logits/chosen": -4.128739356994629,
"logits/rejected": -4.267156600952148,
"logps/chosen": -723.7312622070312,
"logps/rejected": -558.724853515625,
"loss": 0.6135,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1362067013978958,
"rewards/margins": 0.18841738998889923,
"rewards/rejected": -0.05221069976687431,
"step": 2250
},
{
"epoch": 0.07469056764831412,
"grad_norm": 172.0,
"learning_rate": 4.769842612963277e-06,
"logits/chosen": -4.256707191467285,
"logits/rejected": -4.36817741394043,
"logps/chosen": -731.4835815429688,
"logps/rejected": -561.2887573242188,
"loss": 0.6452,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.06940807402133942,
"rewards/margins": 0.11642622202634811,
"rewards/rejected": -0.0470181368291378,
"step": 2275
},
{
"epoch": 0.07551134311697692,
"grad_norm": 163.0,
"learning_rate": 4.765611778642749e-06,
"logits/chosen": -4.106626510620117,
"logits/rejected": -4.237212657928467,
"logps/chosen": -624.831298828125,
"logps/rejected": -555.9198608398438,
"loss": 0.6602,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.06575372815132141,
"rewards/margins": 0.08808085322380066,
"rewards/rejected": -0.022327115759253502,
"step": 2300
},
{
"epoch": 0.07633211858563971,
"grad_norm": 134.0,
"learning_rate": 4.761380944322221e-06,
"logits/chosen": -4.047150135040283,
"logits/rejected": -4.1057209968566895,
"logps/chosen": -701.8414306640625,
"logps/rejected": -535.3977661132812,
"loss": 0.6631,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.09485507011413574,
"rewards/margins": 0.0826391950249672,
"rewards/rejected": 0.012215878814458847,
"step": 2325
},
{
"epoch": 0.0771528940543025,
"grad_norm": 274.0,
"learning_rate": 4.757150110001693e-06,
"logits/chosen": -4.054962635040283,
"logits/rejected": -4.135710716247559,
"logps/chosen": -729.4888305664062,
"logps/rejected": -625.6478881835938,
"loss": 0.6432,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.09774763882160187,
"rewards/margins": 0.12367933988571167,
"rewards/rejected": -0.02593171037733555,
"step": 2350
},
{
"epoch": 0.07797366952296529,
"grad_norm": 354.0,
"learning_rate": 4.752919275681165e-06,
"logits/chosen": -4.054114818572998,
"logits/rejected": -4.159776210784912,
"logps/chosen": -829.9243774414062,
"logps/rejected": -499.4671630859375,
"loss": 0.6425,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.11411032825708389,
"rewards/margins": 0.1236293613910675,
"rewards/rejected": -0.009519041515886784,
"step": 2375
},
{
"epoch": 0.07879444499162809,
"grad_norm": 125.0,
"learning_rate": 4.748688441360637e-06,
"logits/chosen": -4.200274467468262,
"logits/rejected": -4.268646240234375,
"logps/chosen": -702.6317749023438,
"logps/rejected": -487.8554992675781,
"loss": 0.6509,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.08645128458738327,
"rewards/margins": 0.10850445926189423,
"rewards/rejected": -0.022053170949220657,
"step": 2400
},
{
"epoch": 0.07961522046029089,
"grad_norm": 158.0,
"learning_rate": 4.744457607040109e-06,
"logits/chosen": -4.188302516937256,
"logits/rejected": -4.178359508514404,
"logps/chosen": -740.6657104492188,
"logps/rejected": -549.1505126953125,
"loss": 0.6553,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.08103080838918686,
"rewards/margins": 0.09224209934473038,
"rewards/rejected": -0.011211306788027287,
"step": 2425
},
{
"epoch": 0.08043599592895367,
"grad_norm": 108.0,
"learning_rate": 4.740226772719581e-06,
"logits/chosen": -3.956430673599243,
"logits/rejected": -4.144464015960693,
"logps/chosen": -579.575439453125,
"logps/rejected": -490.2182922363281,
"loss": 0.611,
"rewards/accuracies": 0.8199999928474426,
"rewards/chosen": 0.14252804219722748,
"rewards/margins": 0.18614298105239868,
"rewards/rejected": -0.043614912778139114,
"step": 2450
},
{
"epoch": 0.08125677139761647,
"grad_norm": 223.0,
"learning_rate": 4.735995938399053e-06,
"logits/chosen": -4.095440864562988,
"logits/rejected": -4.225205898284912,
"logps/chosen": -797.9213256835938,
"logps/rejected": -592.8213500976562,
"loss": 0.6548,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.10039664059877396,
"rewards/margins": 0.10022265464067459,
"rewards/rejected": 0.00017398863565176725,
"step": 2475
},
{
"epoch": 0.08207754686627926,
"grad_norm": 109.0,
"learning_rate": 4.731765104078525e-06,
"logits/chosen": -4.233191013336182,
"logits/rejected": -4.310283184051514,
"logps/chosen": -621.84326171875,
"logps/rejected": -474.5871887207031,
"loss": 0.6607,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.09741487354040146,
"rewards/margins": 0.08807798475027084,
"rewards/rejected": 0.009336878545582294,
"step": 2500
},
{
"epoch": 0.08289832233494206,
"grad_norm": 158.0,
"learning_rate": 4.727534269757997e-06,
"logits/chosen": -4.1276421546936035,
"logits/rejected": -4.145913600921631,
"logps/chosen": -729.427001953125,
"logps/rejected": -513.8289794921875,
"loss": 0.6248,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.11118257790803909,
"rewards/margins": 0.16082428395748138,
"rewards/rejected": -0.0496416911482811,
"step": 2525
},
{
"epoch": 0.08371909780360484,
"grad_norm": 117.0,
"learning_rate": 4.723303435437469e-06,
"logits/chosen": -4.257935047149658,
"logits/rejected": -4.259277820587158,
"logps/chosen": -620.875,
"logps/rejected": -509.9112548828125,
"loss": 0.6807,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.06533210724592209,
"rewards/margins": 0.039884164929389954,
"rewards/rejected": 0.025447946041822433,
"step": 2550
},
{
"epoch": 0.08453987327226764,
"grad_norm": 142.0,
"learning_rate": 4.719072601116941e-06,
"logits/chosen": -4.256199836730957,
"logits/rejected": -4.267094612121582,
"logps/chosen": -683.416748046875,
"logps/rejected": -555.82373046875,
"loss": 0.6384,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.10108324140310287,
"rewards/margins": 0.11987866461277008,
"rewards/rejected": -0.018795425072312355,
"step": 2575
},
{
"epoch": 0.08536064874093043,
"grad_norm": 156.0,
"learning_rate": 4.714841766796413e-06,
"logits/chosen": -4.165497303009033,
"logits/rejected": -4.2554426193237305,
"logps/chosen": -523.2576904296875,
"logps/rejected": -401.132080078125,
"loss": 0.6547,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.0769733414053917,
"rewards/margins": 0.08824872970581055,
"rewards/rejected": -0.011275377124547958,
"step": 2600
},
{
"epoch": 0.08618142420959322,
"grad_norm": 207.0,
"learning_rate": 4.710610932475885e-06,
"logits/chosen": -4.149517059326172,
"logits/rejected": -4.061563491821289,
"logps/chosen": -748.7947387695312,
"logps/rejected": -543.3330688476562,
"loss": 0.647,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.08717348426580429,
"rewards/margins": 0.11688578873872757,
"rewards/rejected": -0.029712295159697533,
"step": 2625
},
{
"epoch": 0.08700219967825602,
"grad_norm": 165.0,
"learning_rate": 4.706380098155357e-06,
"logits/chosen": -4.1994500160217285,
"logits/rejected": -4.425838470458984,
"logps/chosen": -723.5103149414062,
"logps/rejected": -516.3181762695312,
"loss": 0.6455,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.06696411967277527,
"rewards/margins": 0.11419793963432312,
"rewards/rejected": -0.04723381623625755,
"step": 2650
},
{
"epoch": 0.08782297514691881,
"grad_norm": 276.0,
"learning_rate": 4.702149263834829e-06,
"logits/chosen": -4.170817852020264,
"logits/rejected": -4.261829376220703,
"logps/chosen": -769.914306640625,
"logps/rejected": -620.16845703125,
"loss": 0.6665,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.13049346208572388,
"rewards/margins": 0.0769289955496788,
"rewards/rejected": 0.053564462810754776,
"step": 2675
},
{
"epoch": 0.0886437506155816,
"grad_norm": 318.0,
"learning_rate": 4.697918429514301e-06,
"logits/chosen": -3.8616783618927,
"logits/rejected": -4.007978439331055,
"logps/chosen": -709.0524291992188,
"logps/rejected": -439.6181640625,
"loss": 0.6312,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.10517993569374084,
"rewards/margins": 0.14700084924697876,
"rewards/rejected": -0.041820917278528214,
"step": 2700
},
{
"epoch": 0.08946452608424439,
"grad_norm": 152.0,
"learning_rate": 4.693687595193773e-06,
"logits/chosen": -4.251624584197998,
"logits/rejected": -4.1947197914123535,
"logps/chosen": -780.1447143554688,
"logps/rejected": -565.7095947265625,
"loss": 0.6175,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.12333157658576965,
"rewards/margins": 0.19305434823036194,
"rewards/rejected": -0.06972277909517288,
"step": 2725
},
{
"epoch": 0.09028530155290719,
"grad_norm": 111.0,
"learning_rate": 4.689456760873245e-06,
"logits/chosen": -4.107117652893066,
"logits/rejected": -4.242709636688232,
"logps/chosen": -627.9918212890625,
"logps/rejected": -433.5965881347656,
"loss": 0.6195,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.08907662332057953,
"rewards/margins": 0.17075376212596893,
"rewards/rejected": -0.08167713135480881,
"step": 2750
},
{
"epoch": 0.09110607702156998,
"grad_norm": 169.0,
"learning_rate": 4.685225926552717e-06,
"logits/chosen": -4.189664840698242,
"logits/rejected": -4.309139728546143,
"logps/chosen": -799.1683349609375,
"logps/rejected": -554.3936767578125,
"loss": 0.6118,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1505787968635559,
"rewards/margins": 0.19868890941143036,
"rewards/rejected": -0.048110123723745346,
"step": 2775
},
{
"epoch": 0.09192685249023277,
"grad_norm": 187.0,
"learning_rate": 4.680995092232189e-06,
"logits/chosen": -4.031192779541016,
"logits/rejected": -4.0329108238220215,
"logps/chosen": -847.0072631835938,
"logps/rejected": -591.4379272460938,
"loss": 0.6511,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.07973573356866837,
"rewards/margins": 0.11566194891929626,
"rewards/rejected": -0.0359262190759182,
"step": 2800
},
{
"epoch": 0.09274762795889556,
"grad_norm": 232.0,
"learning_rate": 4.676764257911661e-06,
"logits/chosen": -4.184224605560303,
"logits/rejected": -4.380885124206543,
"logps/chosen": -766.1116943359375,
"logps/rejected": -525.26318359375,
"loss": 0.6456,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.12456972897052765,
"rewards/margins": 0.12566854059696198,
"rewards/rejected": -0.0010988200083374977,
"step": 2825
},
{
"epoch": 0.09356840342755836,
"grad_norm": 254.0,
"learning_rate": 4.672533423591132e-06,
"logits/chosen": -4.144000053405762,
"logits/rejected": -4.1139092445373535,
"logps/chosen": -696.5800170898438,
"logps/rejected": -422.58770751953125,
"loss": 0.6521,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.05194849520921707,
"rewards/margins": 0.11492128670215607,
"rewards/rejected": -0.0629727840423584,
"step": 2850
},
{
"epoch": 0.09438917889622114,
"grad_norm": 131.0,
"learning_rate": 4.668302589270605e-06,
"logits/chosen": -4.328439712524414,
"logits/rejected": -4.420200347900391,
"logps/chosen": -755.8521728515625,
"logps/rejected": -508.4173889160156,
"loss": 0.6161,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.15212687849998474,
"rewards/margins": 0.18056415021419525,
"rewards/rejected": -0.028437262400984764,
"step": 2875
},
{
"epoch": 0.09520995436488394,
"grad_norm": 129.0,
"learning_rate": 4.664071754950076e-06,
"logits/chosen": -4.203272819519043,
"logits/rejected": -4.173132419586182,
"logps/chosen": -728.8046264648438,
"logps/rejected": -540.3849487304688,
"loss": 0.597,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.1777937114238739,
"rewards/margins": 0.2252504974603653,
"rewards/rejected": -0.04745679721236229,
"step": 2900
},
{
"epoch": 0.09603072983354674,
"grad_norm": 99.0,
"learning_rate": 4.659840920629549e-06,
"logits/chosen": -4.030066967010498,
"logits/rejected": -4.034922122955322,
"logps/chosen": -724.56689453125,
"logps/rejected": -503.8728942871094,
"loss": 0.6176,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.14354346692562103,
"rewards/margins": 0.17834116518497467,
"rewards/rejected": -0.034797683358192444,
"step": 2925
},
{
"epoch": 0.09685150530220953,
"grad_norm": 374.0,
"learning_rate": 4.65561008630902e-06,
"logits/chosen": -4.154969692230225,
"logits/rejected": -4.25916862487793,
"logps/chosen": -688.744873046875,
"logps/rejected": -566.5956420898438,
"loss": 0.6823,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.0816277414560318,
"rewards/margins": 0.05097034573554993,
"rewards/rejected": 0.03065740317106247,
"step": 2950
},
{
"epoch": 0.09767228077087232,
"grad_norm": 226.0,
"learning_rate": 4.651379251988493e-06,
"logits/chosen": -4.115259170532227,
"logits/rejected": -4.045093536376953,
"logps/chosen": -814.7285766601562,
"logps/rejected": -566.1422119140625,
"loss": 0.6329,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1374712586402893,
"rewards/margins": 0.15288038551807404,
"rewards/rejected": -0.015409140847623348,
"step": 2975
},
{
"epoch": 0.09849305623953511,
"grad_norm": 180.0,
"learning_rate": 4.647148417667964e-06,
"logits/chosen": -3.97808837890625,
"logits/rejected": -4.152464389801025,
"logps/chosen": -530.7527465820312,
"logps/rejected": -402.433349609375,
"loss": 0.6348,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.10918596386909485,
"rewards/margins": 0.13875608146190643,
"rewards/rejected": -0.02957012504339218,
"step": 3000
},
{
"epoch": 0.09931383170819791,
"grad_norm": 183.0,
"learning_rate": 4.642917583347437e-06,
"logits/chosen": -4.104302883148193,
"logits/rejected": -4.1437273025512695,
"logps/chosen": -675.0985717773438,
"logps/rejected": -486.0548095703125,
"loss": 0.6402,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.10294067114591599,
"rewards/margins": 0.13575658202171326,
"rewards/rejected": -0.03281591460108757,
"step": 3025
},
{
"epoch": 0.1001346071768607,
"grad_norm": 135.0,
"learning_rate": 4.638686749026908e-06,
"logits/chosen": -4.05023193359375,
"logits/rejected": -4.130072593688965,
"logps/chosen": -598.7313232421875,
"logps/rejected": -418.406982421875,
"loss": 0.652,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.08227918297052383,
"rewards/margins": 0.10548584908246994,
"rewards/rejected": -0.023206667974591255,
"step": 3050
},
{
"epoch": 0.10095538264552349,
"grad_norm": 155.0,
"learning_rate": 4.634455914706381e-06,
"logits/chosen": -4.130331039428711,
"logits/rejected": -4.150590896606445,
"logps/chosen": -792.5228881835938,
"logps/rejected": -588.8016967773438,
"loss": 0.6174,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.12860740721225739,
"rewards/margins": 0.1893465369939804,
"rewards/rejected": -0.06073914095759392,
"step": 3075
},
{
"epoch": 0.10177615811418628,
"grad_norm": 187.0,
"learning_rate": 4.630225080385852e-06,
"logits/chosen": -4.148515224456787,
"logits/rejected": -4.221671104431152,
"logps/chosen": -591.18994140625,
"logps/rejected": -556.4551391601562,
"loss": 0.6459,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.06625935435295105,
"rewards/margins": 0.11798873543739319,
"rewards/rejected": -0.05172938480973244,
"step": 3100
},
{
"epoch": 0.10259693358284908,
"grad_norm": 173.0,
"learning_rate": 4.625994246065325e-06,
"logits/chosen": -4.128535747528076,
"logits/rejected": -4.260271072387695,
"logps/chosen": -821.3890380859375,
"logps/rejected": -580.5010986328125,
"loss": 0.6233,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.13844671845436096,
"rewards/margins": 0.19154323637485504,
"rewards/rejected": -0.05309649929404259,
"step": 3125
},
{
"epoch": 0.10341770905151187,
"grad_norm": 71.0,
"learning_rate": 4.621763411744796e-06,
"logits/chosen": -4.129300117492676,
"logits/rejected": -4.251578330993652,
"logps/chosen": -769.3382568359375,
"logps/rejected": -589.3805541992188,
"loss": 0.6388,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.053467120975255966,
"rewards/margins": 0.14735950529575348,
"rewards/rejected": -0.09389238059520721,
"step": 3150
},
{
"epoch": 0.10423848452017466,
"grad_norm": 124.5,
"learning_rate": 4.617532577424269e-06,
"logits/chosen": -4.02286958694458,
"logits/rejected": -4.068836212158203,
"logps/chosen": -746.35205078125,
"logps/rejected": -515.939453125,
"loss": 0.5937,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": 0.16510872542858124,
"rewards/margins": 0.237737774848938,
"rewards/rejected": -0.07262909412384033,
"step": 3175
},
{
"epoch": 0.10505925998883746,
"grad_norm": 183.0,
"learning_rate": 4.61330174310374e-06,
"logits/chosen": -4.258195400238037,
"logits/rejected": -4.252414703369141,
"logps/chosen": -774.0881958007812,
"logps/rejected": -619.4967041015625,
"loss": 0.6142,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.1559956967830658,
"rewards/margins": 0.2075086236000061,
"rewards/rejected": -0.05151292681694031,
"step": 3200
},
{
"epoch": 0.10588003545750024,
"grad_norm": 167.0,
"learning_rate": 4.609070908783213e-06,
"logits/chosen": -4.104750156402588,
"logits/rejected": -4.103361129760742,
"logps/chosen": -654.9664916992188,
"logps/rejected": -494.8394470214844,
"loss": 0.6136,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.1408548504114151,
"rewards/margins": 0.21712234616279602,
"rewards/rejected": -0.07626748830080032,
"step": 3225
},
{
"epoch": 0.10670081092616304,
"grad_norm": 139.0,
"learning_rate": 4.604840074462684e-06,
"logits/chosen": -4.338498115539551,
"logits/rejected": -4.383465766906738,
"logps/chosen": -923.0902099609375,
"logps/rejected": -563.9151611328125,
"loss": 0.618,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.15305353701114655,
"rewards/margins": 0.19835226237773895,
"rewards/rejected": -0.04529871046543121,
"step": 3250
},
{
"epoch": 0.10752158639482583,
"grad_norm": 103.0,
"learning_rate": 4.600609240142157e-06,
"logits/chosen": -4.177867889404297,
"logits/rejected": -4.26078987121582,
"logps/chosen": -620.0000610351562,
"logps/rejected": -437.626953125,
"loss": 0.6493,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.08293507248163223,
"rewards/margins": 0.12653230130672455,
"rewards/rejected": -0.04359724000096321,
"step": 3275
},
{
"epoch": 0.10834236186348863,
"grad_norm": 134.0,
"learning_rate": 4.596378405821628e-06,
"logits/chosen": -4.094875335693359,
"logits/rejected": -4.194637775421143,
"logps/chosen": -598.10498046875,
"logps/rejected": -512.329833984375,
"loss": 0.6383,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.1556520313024521,
"rewards/margins": 0.15189100801944733,
"rewards/rejected": 0.0037610388826578856,
"step": 3300
},
{
"epoch": 0.10916313733215141,
"grad_norm": 203.0,
"learning_rate": 4.592147571501101e-06,
"logits/chosen": -4.059422492980957,
"logits/rejected": -4.308881759643555,
"logps/chosen": -587.382568359375,
"logps/rejected": -533.905029296875,
"loss": 0.6631,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0650976300239563,
"rewards/margins": 0.08459506183862686,
"rewards/rejected": -0.01949743553996086,
"step": 3325
},
{
"epoch": 0.10998391280081421,
"grad_norm": 161.0,
"learning_rate": 4.587916737180572e-06,
"logits/chosen": -4.154107570648193,
"logits/rejected": -4.236989974975586,
"logps/chosen": -815.28173828125,
"logps/rejected": -604.0435791015625,
"loss": 0.6354,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.17469516396522522,
"rewards/margins": 0.16531267762184143,
"rewards/rejected": 0.009382497519254684,
"step": 3350
},
{
"epoch": 0.11080468826947701,
"grad_norm": 165.0,
"learning_rate": 4.583685902860045e-06,
"logits/chosen": -4.099838733673096,
"logits/rejected": -4.225775241851807,
"logps/chosen": -628.6179809570312,
"logps/rejected": -413.48553466796875,
"loss": 0.6492,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.10960385948419571,
"rewards/margins": 0.12888643145561218,
"rewards/rejected": -0.019282571971416473,
"step": 3375
},
{
"epoch": 0.1116254637381398,
"grad_norm": 286.0,
"learning_rate": 4.579455068539516e-06,
"logits/chosen": -4.12373161315918,
"logits/rejected": -4.227139472961426,
"logps/chosen": -669.8875122070312,
"logps/rejected": -554.4998168945312,
"loss": 0.6217,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.08527453243732452,
"rewards/margins": 0.1920253038406372,
"rewards/rejected": -0.10675078630447388,
"step": 3400
},
{
"epoch": 0.11244623920680259,
"grad_norm": 222.0,
"learning_rate": 4.575224234218989e-06,
"logits/chosen": -4.115863800048828,
"logits/rejected": -4.1790452003479,
"logps/chosen": -704.6529541015625,
"logps/rejected": -396.4385070800781,
"loss": 0.66,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.07934532314538956,
"rewards/margins": 0.10207292437553406,
"rewards/rejected": -0.0227276012301445,
"step": 3425
},
{
"epoch": 0.11326701467546538,
"grad_norm": 151.0,
"learning_rate": 4.57099339989846e-06,
"logits/chosen": -4.064518451690674,
"logits/rejected": -4.319989204406738,
"logps/chosen": -744.1235961914062,
"logps/rejected": -554.1798706054688,
"loss": 0.6139,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.11846752464771271,
"rewards/margins": 0.21143819391727448,
"rewards/rejected": -0.09297066926956177,
"step": 3450
},
{
"epoch": 0.11408779014412818,
"grad_norm": 155.0,
"learning_rate": 4.566762565577933e-06,
"logits/chosen": -4.110750675201416,
"logits/rejected": -4.162017822265625,
"logps/chosen": -641.3878173828125,
"logps/rejected": -398.95648193359375,
"loss": 0.6275,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.11753468215465546,
"rewards/margins": 0.16651563346385956,
"rewards/rejected": -0.0489809475839138,
"step": 3475
},
{
"epoch": 0.11490856561279096,
"grad_norm": 247.0,
"learning_rate": 4.562531731257404e-06,
"logits/chosen": -4.102660655975342,
"logits/rejected": -4.216392517089844,
"logps/chosen": -780.7814331054688,
"logps/rejected": -678.3236694335938,
"loss": 0.6189,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17697925865650177,
"rewards/margins": 0.19066447019577026,
"rewards/rejected": -0.013685191050171852,
"step": 3500
},
{
"epoch": 0.11572934108145376,
"grad_norm": 156.0,
"learning_rate": 4.558300896936877e-06,
"logits/chosen": -4.092509746551514,
"logits/rejected": -4.2496185302734375,
"logps/chosen": -773.97607421875,
"logps/rejected": -715.7351684570312,
"loss": 0.6341,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.16046512126922607,
"rewards/margins": 0.1650882363319397,
"rewards/rejected": -0.004623092710971832,
"step": 3525
},
{
"epoch": 0.11655011655011654,
"grad_norm": 251.0,
"learning_rate": 4.554070062616348e-06,
"logits/chosen": -4.061056613922119,
"logits/rejected": -4.082568168640137,
"logps/chosen": -647.5535278320312,
"logps/rejected": -458.9394836425781,
"loss": 0.6262,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.14398512244224548,
"rewards/margins": 0.17963096499443054,
"rewards/rejected": -0.03564583137631416,
"step": 3550
},
{
"epoch": 0.11737089201877934,
"grad_norm": 230.0,
"learning_rate": 4.54983922829582e-06,
"logits/chosen": -4.078422546386719,
"logits/rejected": -4.286499977111816,
"logps/chosen": -601.8107299804688,
"logps/rejected": -559.1649780273438,
"loss": 0.6041,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.17145298421382904,
"rewards/margins": 0.21858105063438416,
"rewards/rejected": -0.04712804779410362,
"step": 3575
},
{
"epoch": 0.11819166748744213,
"grad_norm": 223.0,
"learning_rate": 4.545608393975292e-06,
"logits/chosen": -4.175769805908203,
"logits/rejected": -4.221960544586182,
"logps/chosen": -651.9328002929688,
"logps/rejected": -498.6581115722656,
"loss": 0.6304,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.10373630374670029,
"rewards/margins": 0.16331829130649567,
"rewards/rejected": -0.05958200618624687,
"step": 3600
},
{
"epoch": 0.11901244295610493,
"grad_norm": 164.0,
"learning_rate": 4.541377559654764e-06,
"logits/chosen": -4.202296733856201,
"logits/rejected": -4.219254970550537,
"logps/chosen": -789.4403076171875,
"logps/rejected": -508.3605041503906,
"loss": 0.6121,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.21312782168388367,
"rewards/margins": 0.21922513842582703,
"rewards/rejected": -0.006097340490669012,
"step": 3625
},
{
"epoch": 0.11983321842476773,
"grad_norm": 106.0,
"learning_rate": 4.537146725334236e-06,
"logits/chosen": -4.121474742889404,
"logits/rejected": -4.223160266876221,
"logps/chosen": -642.5357666015625,
"logps/rejected": -391.467041015625,
"loss": 0.6079,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.1511656790971756,
"rewards/margins": 0.20676733553409576,
"rewards/rejected": -0.05560165271162987,
"step": 3650
},
{
"epoch": 0.12065399389343051,
"grad_norm": 143.0,
"learning_rate": 4.532915891013708e-06,
"logits/chosen": -3.9525647163391113,
"logits/rejected": -4.0206708908081055,
"logps/chosen": -648.7009887695312,
"logps/rejected": -420.0541687011719,
"loss": 0.6235,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.11896297335624695,
"rewards/margins": 0.18399059772491455,
"rewards/rejected": -0.06502760946750641,
"step": 3675
},
{
"epoch": 0.12147476936209331,
"grad_norm": 117.0,
"learning_rate": 4.52868505669318e-06,
"logits/chosen": -4.178898334503174,
"logits/rejected": -4.247806072235107,
"logps/chosen": -669.1397094726562,
"logps/rejected": -537.1188354492188,
"loss": 0.5979,
"rewards/accuracies": 0.8399999737739563,
"rewards/chosen": 0.11097316443920135,
"rewards/margins": 0.24359607696533203,
"rewards/rejected": -0.13262291252613068,
"step": 3700
},
{
"epoch": 0.1222955448307561,
"grad_norm": 258.0,
"learning_rate": 4.524454222372652e-06,
"logits/chosen": -4.230034828186035,
"logits/rejected": -4.431790351867676,
"logps/chosen": -781.9459228515625,
"logps/rejected": -604.2550659179688,
"loss": 0.6705,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.02702181786298752,
"rewards/margins": 0.08671737462282181,
"rewards/rejected": -0.059695564210414886,
"step": 3725
},
{
"epoch": 0.1231163202994189,
"grad_norm": 92.5,
"learning_rate": 4.520223388052124e-06,
"logits/chosen": -4.086462497711182,
"logits/rejected": -4.117921829223633,
"logps/chosen": -670.205810546875,
"logps/rejected": -539.9747924804688,
"loss": 0.6525,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.1253843605518341,
"rewards/margins": 0.12610657513141632,
"rewards/rejected": -0.0007222199346870184,
"step": 3750
},
{
"epoch": 0.12393709576808168,
"grad_norm": 163.0,
"learning_rate": 4.515992553731596e-06,
"logits/chosen": -4.041085720062256,
"logits/rejected": -4.055666446685791,
"logps/chosen": -655.8377075195312,
"logps/rejected": -762.2373657226562,
"loss": 0.612,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.10744628310203552,
"rewards/margins": 0.4711156487464905,
"rewards/rejected": -0.3636693060398102,
"step": 3775
},
{
"epoch": 0.12475787123674448,
"grad_norm": 130.0,
"learning_rate": 4.511761719411068e-06,
"logits/chosen": -3.9867475032806396,
"logits/rejected": -4.021495819091797,
"logps/chosen": -705.5885009765625,
"logps/rejected": -611.0843505859375,
"loss": 0.6303,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.12187641113996506,
"rewards/margins": 0.16648361086845398,
"rewards/rejected": -0.04460718482732773,
"step": 3800
},
{
"epoch": 0.12557864670540728,
"grad_norm": 142.0,
"learning_rate": 4.50753088509054e-06,
"logits/chosen": -4.264425754547119,
"logits/rejected": -4.1908040046691895,
"logps/chosen": -770.77294921875,
"logps/rejected": -574.1017456054688,
"loss": 0.6508,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1506727933883667,
"rewards/margins": 0.17019566893577576,
"rewards/rejected": -0.019522881135344505,
"step": 3825
},
{
"epoch": 0.12639942217407008,
"grad_norm": 150.0,
"learning_rate": 4.503300050770012e-06,
"logits/chosen": -4.039714813232422,
"logits/rejected": -4.125188827514648,
"logps/chosen": -787.6677856445312,
"logps/rejected": -570.5169067382812,
"loss": 0.6468,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.1255396604537964,
"rewards/margins": 0.12908677756786346,
"rewards/rejected": -0.0035471057053655386,
"step": 3850
},
{
"epoch": 0.12722019764273285,
"grad_norm": 225.0,
"learning_rate": 4.499069216449484e-06,
"logits/chosen": -4.007396697998047,
"logits/rejected": -3.876781702041626,
"logps/chosen": -626.9278564453125,
"logps/rejected": -422.06170654296875,
"loss": 0.6722,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.06111162155866623,
"rewards/margins": 0.07888734340667725,
"rewards/rejected": -0.017775723710656166,
"step": 3875
},
{
"epoch": 0.12804097311139565,
"grad_norm": 354.0,
"learning_rate": 4.494838382128956e-06,
"logits/chosen": -4.007481575012207,
"logits/rejected": -3.866903781890869,
"logps/chosen": -805.19921875,
"logps/rejected": -565.35107421875,
"loss": 0.6381,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1270570307970047,
"rewards/margins": 0.1453864425420761,
"rewards/rejected": -0.018329383805394173,
"step": 3900
},
{
"epoch": 0.12886174858005844,
"grad_norm": 151.0,
"learning_rate": 4.490607547808428e-06,
"logits/chosen": -4.091001987457275,
"logits/rejected": -4.0446929931640625,
"logps/chosen": -804.8853759765625,
"logps/rejected": -614.6016845703125,
"loss": 0.6234,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.12878599762916565,
"rewards/margins": 0.19198790192604065,
"rewards/rejected": -0.0632018968462944,
"step": 3925
},
{
"epoch": 0.12968252404872124,
"grad_norm": 89.0,
"learning_rate": 4.4863767134879e-06,
"logits/chosen": -4.09019660949707,
"logits/rejected": -4.269530773162842,
"logps/chosen": -662.8995971679688,
"logps/rejected": -482.9696044921875,
"loss": 0.6567,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.08450495451688766,
"rewards/margins": 0.10292019695043564,
"rewards/rejected": -0.018415246158838272,
"step": 3950
},
{
"epoch": 0.130503299517384,
"grad_norm": 352.0,
"learning_rate": 4.482145879167372e-06,
"logits/chosen": -4.184777736663818,
"logits/rejected": -4.207626819610596,
"logps/chosen": -727.8124389648438,
"logps/rejected": -498.6629638671875,
"loss": 0.6473,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.17035777866840363,
"rewards/margins": 0.12111759930849075,
"rewards/rejected": 0.04924018308520317,
"step": 3975
},
{
"epoch": 0.1313240749860468,
"grad_norm": 220.0,
"learning_rate": 4.477915044846844e-06,
"logits/chosen": -4.153555393218994,
"logits/rejected": -4.249951362609863,
"logps/chosen": -749.2251586914062,
"logps/rejected": -574.6797485351562,
"loss": 0.6228,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": 0.14904262125492096,
"rewards/margins": 0.18919992446899414,
"rewards/rejected": -0.04015731438994408,
"step": 4000
},
{
"epoch": 0.1321448504547096,
"grad_norm": 116.5,
"learning_rate": 4.473684210526316e-06,
"logits/chosen": -4.1704230308532715,
"logits/rejected": -4.231996536254883,
"logps/chosen": -990.6297607421875,
"logps/rejected": -666.6671752929688,
"loss": 0.6352,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1348913311958313,
"rewards/margins": 0.17748390138149261,
"rewards/rejected": -0.04259258881211281,
"step": 4025
},
{
"epoch": 0.1329656259233724,
"grad_norm": 450.0,
"learning_rate": 4.469453376205788e-06,
"logits/chosen": -4.276303291320801,
"logits/rejected": -4.25356388092041,
"logps/chosen": -855.682373046875,
"logps/rejected": -677.4093627929688,
"loss": 0.6157,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.14246423542499542,
"rewards/margins": 0.2275465726852417,
"rewards/rejected": -0.08508235216140747,
"step": 4050
},
{
"epoch": 0.13378640139203518,
"grad_norm": 209.0,
"learning_rate": 4.46522254188526e-06,
"logits/chosen": -4.336771488189697,
"logits/rejected": -4.413280963897705,
"logps/chosen": -787.864501953125,
"logps/rejected": -556.9666748046875,
"loss": 0.6387,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1050407811999321,
"rewards/margins": 0.16268308460712433,
"rewards/rejected": -0.05764232203364372,
"step": 4075
},
{
"epoch": 0.13460717686069798,
"grad_norm": 129.0,
"learning_rate": 4.460991707564732e-06,
"logits/chosen": -4.026488304138184,
"logits/rejected": -4.244543552398682,
"logps/chosen": -593.8565063476562,
"logps/rejected": -494.901611328125,
"loss": 0.6009,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": 0.05559429153800011,
"rewards/margins": 0.2449643760919571,
"rewards/rejected": -0.1893700361251831,
"step": 4100
},
{
"epoch": 0.13542795232936078,
"grad_norm": 126.5,
"learning_rate": 4.456760873244204e-06,
"logits/chosen": -4.122188091278076,
"logits/rejected": -4.109318733215332,
"logps/chosen": -474.5272521972656,
"logps/rejected": -401.0484619140625,
"loss": 0.6556,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.09522067755460739,
"rewards/margins": 0.10534299910068512,
"rewards/rejected": -0.010122320614755154,
"step": 4125
},
{
"epoch": 0.13624872779802358,
"grad_norm": 170.0,
"learning_rate": 4.452530038923676e-06,
"logits/chosen": -4.061251640319824,
"logits/rejected": -4.103055000305176,
"logps/chosen": -752.6796875,
"logps/rejected": -524.8905029296875,
"loss": 0.6571,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.09011424332857132,
"rewards/margins": 0.11959852278232574,
"rewards/rejected": -0.029484273865818977,
"step": 4150
},
{
"epoch": 0.13706950326668638,
"grad_norm": 174.0,
"learning_rate": 4.448299204603148e-06,
"logits/chosen": -4.170642375946045,
"logits/rejected": -4.242205619812012,
"logps/chosen": -837.6634521484375,
"logps/rejected": -575.63525390625,
"loss": 0.6356,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1658136397600174,
"rewards/margins": 0.16742119193077087,
"rewards/rejected": -0.0016075682360678911,
"step": 4175
},
{
"epoch": 0.13789027873534915,
"grad_norm": 390.0,
"learning_rate": 4.44406837028262e-06,
"logits/chosen": -4.180340766906738,
"logits/rejected": -4.24651575088501,
"logps/chosen": -639.0101318359375,
"logps/rejected": -509.2608642578125,
"loss": 0.6631,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.05516006052494049,
"rewards/margins": 0.1022617444396019,
"rewards/rejected": -0.047101687639951706,
"step": 4200
},
{
"epoch": 0.13871105420401195,
"grad_norm": 119.5,
"learning_rate": 4.439837535962092e-06,
"logits/chosen": -4.179917335510254,
"logits/rejected": -4.300662994384766,
"logps/chosen": -896.769775390625,
"logps/rejected": -689.7025756835938,
"loss": 0.6706,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.12181320041418076,
"rewards/margins": 0.09058444947004318,
"rewards/rejected": 0.031228771433234215,
"step": 4225
},
{
"epoch": 0.13953182967267475,
"grad_norm": 207.0,
"learning_rate": 4.435606701641564e-06,
"logits/chosen": -4.1937360763549805,
"logits/rejected": -4.245815277099609,
"logps/chosen": -621.9994506835938,
"logps/rejected": -518.42724609375,
"loss": 0.6725,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1049574688076973,
"rewards/margins": 0.07572697848081589,
"rewards/rejected": 0.02923049032688141,
"step": 4250
},
{
"epoch": 0.14035260514133754,
"grad_norm": 129.0,
"learning_rate": 4.431375867321036e-06,
"logits/chosen": -4.198615550994873,
"logits/rejected": -4.119570255279541,
"logps/chosen": -573.978271484375,
"logps/rejected": -433.1485595703125,
"loss": 0.6443,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.0844545066356659,
"rewards/margins": 0.14824089407920837,
"rewards/rejected": -0.06378639489412308,
"step": 4275
},
{
"epoch": 0.14117338061000032,
"grad_norm": 1808.0,
"learning_rate": 4.427145033000508e-06,
"logits/chosen": -3.9868874549865723,
"logits/rejected": -4.086323261260986,
"logps/chosen": -702.60009765625,
"logps/rejected": -433.14678955078125,
"loss": 0.6546,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.11101648211479187,
"rewards/margins": 0.12218675017356873,
"rewards/rejected": -0.01117025688290596,
"step": 4300
},
{
"epoch": 0.14199415607866311,
"grad_norm": 165.0,
"learning_rate": 4.42291419867998e-06,
"logits/chosen": -4.045719146728516,
"logits/rejected": -4.074853897094727,
"logps/chosen": -693.7863159179688,
"logps/rejected": -559.730224609375,
"loss": 0.6367,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.14746950566768646,
"rewards/margins": 0.15490588545799255,
"rewards/rejected": -0.007436369080096483,
"step": 4325
},
{
"epoch": 0.1428149315473259,
"grad_norm": 174.0,
"learning_rate": 4.418683364359452e-06,
"logits/chosen": -4.222276210784912,
"logits/rejected": -4.294714450836182,
"logps/chosen": -605.540771484375,
"logps/rejected": -488.9421081542969,
"loss": 0.6346,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.10411551594734192,
"rewards/margins": 0.14872083067893982,
"rewards/rejected": -0.0446053072810173,
"step": 4350
},
{
"epoch": 0.1436357070159887,
"grad_norm": 88.0,
"learning_rate": 4.414452530038924e-06,
"logits/chosen": -4.0823750495910645,
"logits/rejected": -4.026907920837402,
"logps/chosen": -584.8992919921875,
"logps/rejected": -504.4686279296875,
"loss": 0.6124,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1442980170249939,
"rewards/margins": 0.2002059370279312,
"rewards/rejected": -0.055907901376485825,
"step": 4375
},
{
"epoch": 0.1444564824846515,
"grad_norm": 226.0,
"learning_rate": 4.410221695718396e-06,
"logits/chosen": -4.218522548675537,
"logits/rejected": -4.213830947875977,
"logps/chosen": -909.9868774414062,
"logps/rejected": -584.3450927734375,
"loss": 0.6128,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.19589273631572723,
"rewards/margins": 0.2196936011314392,
"rewards/rejected": -0.023800864815711975,
"step": 4400
},
{
"epoch": 0.14527725795331428,
"grad_norm": 212.0,
"learning_rate": 4.405990861397868e-06,
"logits/chosen": -4.175970077514648,
"logits/rejected": -4.285007953643799,
"logps/chosen": -666.7886962890625,
"logps/rejected": -492.61016845703125,
"loss": 0.6155,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1311405748128891,
"rewards/margins": 0.20073936879634857,
"rewards/rejected": -0.06959877908229828,
"step": 4425
},
{
"epoch": 0.14609803342197708,
"grad_norm": 298.0,
"learning_rate": 4.40176002707734e-06,
"logits/chosen": -4.091942310333252,
"logits/rejected": -4.163181304931641,
"logps/chosen": -770.0662231445312,
"logps/rejected": -469.1553955078125,
"loss": 0.6015,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.19774757325649261,
"rewards/margins": 0.2411683201789856,
"rewards/rejected": -0.04342072829604149,
"step": 4450
},
{
"epoch": 0.14691880889063988,
"grad_norm": 194.0,
"learning_rate": 4.397529192756812e-06,
"logits/chosen": -4.143798351287842,
"logits/rejected": -4.189226150512695,
"logps/chosen": -596.7088012695312,
"logps/rejected": -398.5277404785156,
"loss": 0.6588,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.06982951611280441,
"rewards/margins": 0.12531226873397827,
"rewards/rejected": -0.05548277497291565,
"step": 4475
},
{
"epoch": 0.14773958435930268,
"grad_norm": 218.0,
"learning_rate": 4.393298358436284e-06,
"logits/chosen": -4.239335060119629,
"logits/rejected": -4.231566905975342,
"logps/chosen": -792.1026000976562,
"logps/rejected": -575.8635864257812,
"loss": 0.6145,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.17430633306503296,
"rewards/margins": 0.2026720494031906,
"rewards/rejected": -0.028365688398480415,
"step": 4500
},
{
"epoch": 0.14856035982796545,
"grad_norm": 161.0,
"learning_rate": 4.389067524115756e-06,
"logits/chosen": -4.187715530395508,
"logits/rejected": -4.225770950317383,
"logps/chosen": -643.482421875,
"logps/rejected": -506.84161376953125,
"loss": 0.6411,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.0837879404425621,
"rewards/margins": 0.16951730847358704,
"rewards/rejected": -0.08572938293218613,
"step": 4525
},
{
"epoch": 0.14938113529662825,
"grad_norm": 160.0,
"learning_rate": 4.384836689795228e-06,
"logits/chosen": -4.2231764793396,
"logits/rejected": -4.232290744781494,
"logps/chosen": -607.963134765625,
"logps/rejected": -468.76544189453125,
"loss": 0.641,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.14704573154449463,
"rewards/margins": 0.1499151736497879,
"rewards/rejected": -0.002869446761906147,
"step": 4550
},
{
"epoch": 0.15020191076529105,
"grad_norm": 161.0,
"learning_rate": 4.3806058554747e-06,
"logits/chosen": -4.201754093170166,
"logits/rejected": -4.216127872467041,
"logps/chosen": -621.528076171875,
"logps/rejected": -426.3268737792969,
"loss": 0.6394,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.12953585386276245,
"rewards/margins": 0.1462259441614151,
"rewards/rejected": -0.016690107062458992,
"step": 4575
},
{
"epoch": 0.15102268623395385,
"grad_norm": 206.0,
"learning_rate": 4.376375021154172e-06,
"logits/chosen": -4.243280410766602,
"logits/rejected": -4.3009724617004395,
"logps/chosen": -822.161865234375,
"logps/rejected": -618.0408325195312,
"loss": 0.6202,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.21996277570724487,
"rewards/margins": 0.2018716037273407,
"rewards/rejected": 0.018091170117259026,
"step": 4600
},
{
"epoch": 0.15184346170261664,
"grad_norm": 62.75,
"learning_rate": 4.372144186833644e-06,
"logits/chosen": -4.265949249267578,
"logits/rejected": -4.3107805252075195,
"logps/chosen": -684.9762573242188,
"logps/rejected": -497.3363952636719,
"loss": 0.6175,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.21863166987895966,
"rewards/margins": 0.20082683861255646,
"rewards/rejected": 0.017804812639951706,
"step": 4625
},
{
"epoch": 0.15266423717127942,
"grad_norm": 156.0,
"learning_rate": 4.367913352513116e-06,
"logits/chosen": -4.131052494049072,
"logits/rejected": -4.288494110107422,
"logps/chosen": -714.316650390625,
"logps/rejected": -461.1311340332031,
"loss": 0.5934,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.2658103108406067,
"rewards/margins": 0.2636970579624176,
"rewards/rejected": 0.00211325753480196,
"step": 4650
},
{
"epoch": 0.15348501263994221,
"grad_norm": 191.0,
"learning_rate": 4.363682518192588e-06,
"logits/chosen": -4.270446300506592,
"logits/rejected": -4.1907639503479,
"logps/chosen": -814.872314453125,
"logps/rejected": -580.6480102539062,
"loss": 0.648,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1529182642698288,
"rewards/margins": 0.13565631210803986,
"rewards/rejected": 0.017261944711208344,
"step": 4675
},
{
"epoch": 0.154305788108605,
"grad_norm": 332.0,
"learning_rate": 4.35945168387206e-06,
"logits/chosen": -4.242652893066406,
"logits/rejected": -4.241410255432129,
"logps/chosen": -663.955322265625,
"logps/rejected": -559.5843505859375,
"loss": 0.6485,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.10647504031658173,
"rewards/margins": 0.13156315684318542,
"rewards/rejected": -0.025088123977184296,
"step": 4700
},
{
"epoch": 0.1551265635772678,
"grad_norm": 138.0,
"learning_rate": 4.355220849551532e-06,
"logits/chosen": -3.9757704734802246,
"logits/rejected": -3.965700149536133,
"logps/chosen": -679.7207641601562,
"logps/rejected": -476.59259033203125,
"loss": 0.6664,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.15955568850040436,
"rewards/margins": 0.10310190171003342,
"rewards/rejected": 0.05645379424095154,
"step": 4725
},
{
"epoch": 0.15594733904593058,
"grad_norm": 106.5,
"learning_rate": 4.350990015231004e-06,
"logits/chosen": -4.21544075012207,
"logits/rejected": -4.245710372924805,
"logps/chosen": -1018.4263305664062,
"logps/rejected": -763.0903930664062,
"loss": 0.614,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.23744136095046997,
"rewards/margins": 0.22951194643974304,
"rewards/rejected": 0.007929441519081593,
"step": 4750
},
{
"epoch": 0.15676811451459338,
"grad_norm": 170.0,
"learning_rate": 4.346759180910476e-06,
"logits/chosen": -4.2572550773620605,
"logits/rejected": -4.290678977966309,
"logps/chosen": -785.2484130859375,
"logps/rejected": -493.4697570800781,
"loss": 0.6116,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1742425113916397,
"rewards/margins": 0.20866689085960388,
"rewards/rejected": -0.03442436829209328,
"step": 4775
},
{
"epoch": 0.15758888998325618,
"grad_norm": 168.0,
"learning_rate": 4.342528346589948e-06,
"logits/chosen": -4.143786430358887,
"logits/rejected": -4.080834865570068,
"logps/chosen": -737.3786010742188,
"logps/rejected": -521.0652465820312,
"loss": 0.5969,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.19123902916908264,
"rewards/margins": 0.24138489365577698,
"rewards/rejected": -0.05014587566256523,
"step": 4800
},
{
"epoch": 0.15840966545191898,
"grad_norm": 190.0,
"learning_rate": 4.33829751226942e-06,
"logits/chosen": -4.198343753814697,
"logits/rejected": -4.262630462646484,
"logps/chosen": -691.4227905273438,
"logps/rejected": -543.6026611328125,
"loss": 0.6123,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.1909123808145523,
"rewards/margins": 0.20675772428512573,
"rewards/rejected": -0.015845321118831635,
"step": 4825
},
{
"epoch": 0.15923044092058178,
"grad_norm": 133.0,
"learning_rate": 4.334066677948892e-06,
"logits/chosen": -4.119637489318848,
"logits/rejected": -4.084893226623535,
"logps/chosen": -581.482666015625,
"logps/rejected": -442.5643615722656,
"loss": 0.5791,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.21446186304092407,
"rewards/margins": 0.31452712416648865,
"rewards/rejected": -0.10006527602672577,
"step": 4850
},
{
"epoch": 0.16005121638924455,
"grad_norm": 112.5,
"learning_rate": 4.329835843628364e-06,
"logits/chosen": -4.266894340515137,
"logits/rejected": -4.300928115844727,
"logps/chosen": -833.2391967773438,
"logps/rejected": -558.8040771484375,
"loss": 0.667,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.10664394497871399,
"rewards/margins": 0.12200988829135895,
"rewards/rejected": -0.01536593772470951,
"step": 4875
},
{
"epoch": 0.16087199185790735,
"grad_norm": 153.0,
"learning_rate": 4.325605009307836e-06,
"logits/chosen": -4.04871940612793,
"logits/rejected": -4.289731025695801,
"logps/chosen": -695.7615356445312,
"logps/rejected": -539.9432983398438,
"loss": 0.5951,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.20514139533042908,
"rewards/margins": 0.2501979470252991,
"rewards/rejected": -0.0450565330684185,
"step": 4900
},
{
"epoch": 0.16169276732657015,
"grad_norm": 130.0,
"learning_rate": 4.321374174987308e-06,
"logits/chosen": -4.080851078033447,
"logits/rejected": -4.139771938323975,
"logps/chosen": -775.0393676757812,
"logps/rejected": -510.61358642578125,
"loss": 0.6145,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.17780721187591553,
"rewards/margins": 0.22440527379512787,
"rewards/rejected": -0.04659804329276085,
"step": 4925
},
{
"epoch": 0.16251354279523295,
"grad_norm": 130.0,
"learning_rate": 4.31714334066678e-06,
"logits/chosen": -4.235746383666992,
"logits/rejected": -4.229293346405029,
"logps/chosen": -632.58154296875,
"logps/rejected": -580.3175048828125,
"loss": 0.6374,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1310328096151352,
"rewards/margins": 0.16347967088222504,
"rewards/rejected": -0.032446879893541336,
"step": 4950
},
{
"epoch": 0.16333431826389572,
"grad_norm": 320.0,
"learning_rate": 4.312912506346252e-06,
"logits/chosen": -3.9886889457702637,
"logits/rejected": -4.151256084442139,
"logps/chosen": -638.684814453125,
"logps/rejected": -490.31024169921875,
"loss": 0.6275,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.0955779105424881,
"rewards/margins": 0.19564473628997803,
"rewards/rejected": -0.10006682574748993,
"step": 4975
},
{
"epoch": 0.16415509373255852,
"grad_norm": 228.0,
"learning_rate": 4.308681672025724e-06,
"logits/chosen": -4.112755298614502,
"logits/rejected": -4.220064640045166,
"logps/chosen": -815.4720458984375,
"logps/rejected": -582.703369140625,
"loss": 0.6494,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.08248654007911682,
"rewards/margins": 0.1237930878996849,
"rewards/rejected": -0.041306547820568085,
"step": 5000
},
{
"epoch": 0.16497586920122131,
"grad_norm": 362.0,
"learning_rate": 4.304450837705196e-06,
"logits/chosen": -4.267681121826172,
"logits/rejected": -4.1627960205078125,
"logps/chosen": -826.8048706054688,
"logps/rejected": -580.4628295898438,
"loss": 0.6022,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": 0.17233513295650482,
"rewards/margins": 0.23386350274085999,
"rewards/rejected": -0.06152837723493576,
"step": 5025
},
{
"epoch": 0.1657966446698841,
"grad_norm": 280.0,
"learning_rate": 4.300220003384668e-06,
"logits/chosen": -4.138628005981445,
"logits/rejected": -4.228004455566406,
"logps/chosen": -595.0879516601562,
"logps/rejected": -410.44061279296875,
"loss": 0.6298,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.1664738804101944,
"rewards/margins": 0.1815992295742035,
"rewards/rejected": -0.015125354751944542,
"step": 5050
},
{
"epoch": 0.1666174201385469,
"grad_norm": 175.0,
"learning_rate": 4.29598916906414e-06,
"logits/chosen": -4.243340969085693,
"logits/rejected": -4.193829536437988,
"logps/chosen": -670.334228515625,
"logps/rejected": -474.9371032714844,
"loss": 0.6345,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.16480280458927155,
"rewards/margins": 0.16151557862758636,
"rewards/rejected": 0.0032872117590159178,
"step": 5075
},
{
"epoch": 0.16743819560720968,
"grad_norm": 191.0,
"learning_rate": 4.291758334743612e-06,
"logits/chosen": -4.132363319396973,
"logits/rejected": -4.141480922698975,
"logps/chosen": -799.2957763671875,
"logps/rejected": -670.20849609375,
"loss": 0.6597,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.13164381682872772,
"rewards/margins": 0.11912553012371063,
"rewards/rejected": 0.012518273666501045,
"step": 5100
},
{
"epoch": 0.16825897107587248,
"grad_norm": 192.0,
"learning_rate": 4.287527500423084e-06,
"logits/chosen": -4.019545078277588,
"logits/rejected": -4.000518321990967,
"logps/chosen": -720.95947265625,
"logps/rejected": -593.3463134765625,
"loss": 0.657,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.12281408905982971,
"rewards/margins": 0.5138216614723206,
"rewards/rejected": -0.39100757241249084,
"step": 5125
},
{
"epoch": 0.16907974654453528,
"grad_norm": 208.0,
"learning_rate": 4.283296666102556e-06,
"logits/chosen": -4.264939785003662,
"logits/rejected": -4.2540388107299805,
"logps/chosen": -605.2987670898438,
"logps/rejected": -425.5446472167969,
"loss": 0.6227,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1905955821275711,
"rewards/margins": 0.18971359729766846,
"rewards/rejected": 0.00088197470176965,
"step": 5150
},
{
"epoch": 0.16990052201319808,
"grad_norm": 143.0,
"learning_rate": 4.279065831782028e-06,
"logits/chosen": -4.044861793518066,
"logits/rejected": -4.183505058288574,
"logps/chosen": -620.2490234375,
"logps/rejected": -478.9082336425781,
"loss": 0.5998,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.18604780733585358,
"rewards/margins": 0.23399414122104645,
"rewards/rejected": -0.04794633388519287,
"step": 5175
},
{
"epoch": 0.17072129748186085,
"grad_norm": 256.0,
"learning_rate": 4.2748349974615e-06,
"logits/chosen": -4.1650519371032715,
"logits/rejected": -4.288421154022217,
"logps/chosen": -782.1646728515625,
"logps/rejected": -516.1348876953125,
"loss": 0.6416,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.14982113242149353,
"rewards/margins": 0.17511123418807983,
"rewards/rejected": -0.02529011480510235,
"step": 5200
},
{
"epoch": 0.17154207295052365,
"grad_norm": 226.0,
"learning_rate": 4.270604163140972e-06,
"logits/chosen": -4.2230544090271,
"logits/rejected": -4.363689422607422,
"logps/chosen": -932.8560791015625,
"logps/rejected": -670.7036743164062,
"loss": 0.5626,
"rewards/accuracies": 0.7599999904632568,
"rewards/chosen": 0.21535824239253998,
"rewards/margins": 0.3550167977809906,
"rewards/rejected": -0.13965855538845062,
"step": 5225
},
{
"epoch": 0.17236284841918645,
"grad_norm": 89.5,
"learning_rate": 4.266373328820444e-06,
"logits/chosen": -4.2306623458862305,
"logits/rejected": -4.218886375427246,
"logps/chosen": -593.7825927734375,
"logps/rejected": -460.80255126953125,
"loss": 0.6256,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.24924713373184204,
"rewards/margins": 0.19636379182338715,
"rewards/rejected": 0.05288334935903549,
"step": 5250
},
{
"epoch": 0.17318362388784925,
"grad_norm": 104.5,
"learning_rate": 4.262142494499916e-06,
"logits/chosen": -4.100011825561523,
"logits/rejected": -4.121422290802002,
"logps/chosen": -763.5194702148438,
"logps/rejected": -460.8433532714844,
"loss": 0.5805,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.30740201473236084,
"rewards/margins": 0.29280537366867065,
"rewards/rejected": 0.014596661552786827,
"step": 5275
},
{
"epoch": 0.17400439935651205,
"grad_norm": 145.0,
"learning_rate": 4.257911660179388e-06,
"logits/chosen": -4.0073347091674805,
"logits/rejected": -4.158247470855713,
"logps/chosen": -669.4347534179688,
"logps/rejected": -583.874755859375,
"loss": 0.6597,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.09481468796730042,
"rewards/margins": 0.12035001069307327,
"rewards/rejected": -0.025535322725772858,
"step": 5300
},
{
"epoch": 0.17482517482517482,
"grad_norm": 194.0,
"learning_rate": 4.25368082585886e-06,
"logits/chosen": -4.175117015838623,
"logits/rejected": -4.1560378074646,
"logps/chosen": -516.7086181640625,
"logps/rejected": -377.07342529296875,
"loss": 0.6758,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.13141313195228577,
"rewards/margins": 0.06963927298784256,
"rewards/rejected": 0.061773862689733505,
"step": 5325
},
{
"epoch": 0.17564595029383762,
"grad_norm": 228.0,
"learning_rate": 4.249449991538332e-06,
"logits/chosen": -4.131567478179932,
"logits/rejected": -4.205038070678711,
"logps/chosen": -705.3861083984375,
"logps/rejected": -533.5681762695312,
"loss": 0.641,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.16328547894954681,
"rewards/margins": 0.18053361773490906,
"rewards/rejected": -0.017248129472136497,
"step": 5350
},
{
"epoch": 0.17646672576250041,
"grad_norm": 63.25,
"learning_rate": 4.245219157217804e-06,
"logits/chosen": -4.128468036651611,
"logits/rejected": -4.179846286773682,
"logps/chosen": -770.1663208007812,
"logps/rejected": -586.923828125,
"loss": 0.5994,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1435445100069046,
"rewards/margins": 0.26513317227363586,
"rewards/rejected": -0.12158867716789246,
"step": 5375
},
{
"epoch": 0.1772875012311632,
"grad_norm": 172.0,
"learning_rate": 4.240988322897276e-06,
"logits/chosen": -4.156975269317627,
"logits/rejected": -4.22945499420166,
"logps/chosen": -607.2662353515625,
"logps/rejected": -553.4839477539062,
"loss": 0.5973,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.25185444951057434,
"rewards/margins": 0.28923726081848145,
"rewards/rejected": -0.037382807582616806,
"step": 5400
},
{
"epoch": 0.17810827669982598,
"grad_norm": 115.5,
"learning_rate": 4.236757488576748e-06,
"logits/chosen": -4.242987155914307,
"logits/rejected": -4.287976264953613,
"logps/chosen": -649.2179565429688,
"logps/rejected": -505.8639831542969,
"loss": 0.5919,
"rewards/accuracies": 0.7799999713897705,
"rewards/chosen": 0.19670113921165466,
"rewards/margins": 0.25432896614074707,
"rewards/rejected": -0.057627782225608826,
"step": 5425
},
{
"epoch": 0.17892905216848878,
"grad_norm": 215.0,
"learning_rate": 4.23252665425622e-06,
"logits/chosen": -4.028115272521973,
"logits/rejected": -4.092276096343994,
"logps/chosen": -829.40380859375,
"logps/rejected": -672.47509765625,
"loss": 0.6481,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.15024369955062866,
"rewards/margins": 0.1664939671754837,
"rewards/rejected": -0.016250288113951683,
"step": 5450
},
{
"epoch": 0.17974982763715158,
"grad_norm": 112.0,
"learning_rate": 4.228295819935692e-06,
"logits/chosen": -4.228298187255859,
"logits/rejected": -4.2013726234436035,
"logps/chosen": -787.6983032226562,
"logps/rejected": -453.38031005859375,
"loss": 0.5961,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.25117358565330505,
"rewards/margins": 0.26828765869140625,
"rewards/rejected": -0.017114050686359406,
"step": 5475
},
{
"epoch": 0.18057060310581438,
"grad_norm": 115.0,
"learning_rate": 4.224064985615164e-06,
"logits/chosen": -4.019930839538574,
"logits/rejected": -4.275580883026123,
"logps/chosen": -673.7694702148438,
"logps/rejected": -538.3352661132812,
"loss": 0.6338,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.1483411341905594,
"rewards/margins": 0.1611100435256958,
"rewards/rejected": -0.012768914923071861,
"step": 5500
},
{
"epoch": 0.18139137857447715,
"grad_norm": 129.0,
"learning_rate": 4.219834151294636e-06,
"logits/chosen": -4.229439735412598,
"logits/rejected": -4.16810417175293,
"logps/chosen": -900.92431640625,
"logps/rejected": -535.6353759765625,
"loss": 0.6067,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.23091623187065125,
"rewards/margins": 0.24078793823719025,
"rewards/rejected": -0.0098717100918293,
"step": 5525
},
{
"epoch": 0.18221215404313995,
"grad_norm": 169.0,
"learning_rate": 4.215603316974108e-06,
"logits/chosen": -4.243109226226807,
"logits/rejected": -4.321156024932861,
"logps/chosen": -867.3097534179688,
"logps/rejected": -529.377197265625,
"loss": 0.5628,
"rewards/accuracies": 0.8199999928474426,
"rewards/chosen": 0.20608556270599365,
"rewards/margins": 0.33511531352996826,
"rewards/rejected": -0.1290297657251358,
"step": 5550
},
{
"epoch": 0.18303292951180275,
"grad_norm": 163.0,
"learning_rate": 4.21137248265358e-06,
"logits/chosen": -4.261384963989258,
"logits/rejected": -4.396700859069824,
"logps/chosen": -738.622802734375,
"logps/rejected": -579.0276489257812,
"loss": 0.6155,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.1820848137140274,
"rewards/margins": 0.20051410794258118,
"rewards/rejected": -0.018429284915328026,
"step": 5575
},
{
"epoch": 0.18385370498046555,
"grad_norm": 298.0,
"learning_rate": 4.207141648333052e-06,
"logits/chosen": -4.233059883117676,
"logits/rejected": -4.272811412811279,
"logps/chosen": -814.3858642578125,
"logps/rejected": -547.3389282226562,
"loss": 0.6609,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.1449856013059616,
"rewards/margins": 0.10664605349302292,
"rewards/rejected": 0.038339558988809586,
"step": 5600
},
{
"epoch": 0.18467448044912835,
"grad_norm": 65.0,
"learning_rate": 4.202910814012524e-06,
"logits/chosen": -4.120992660522461,
"logits/rejected": -4.08679723739624,
"logps/chosen": -621.9240112304688,
"logps/rejected": -547.0635375976562,
"loss": 0.6141,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.16784168779850006,
"rewards/margins": 0.20409084856510162,
"rewards/rejected": -0.03624917194247246,
"step": 5625
},
{
"epoch": 0.18549525591779112,
"grad_norm": 163.0,
"learning_rate": 4.198679979691996e-06,
"logits/chosen": -4.074944019317627,
"logits/rejected": -4.333995819091797,
"logps/chosen": -645.5282592773438,
"logps/rejected": -481.385986328125,
"loss": 0.633,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.18368229269981384,
"rewards/margins": 0.20405273139476776,
"rewards/rejected": -0.020370442420244217,
"step": 5650
},
{
"epoch": 0.18631603138645392,
"grad_norm": 147.0,
"learning_rate": 4.194449145371468e-06,
"logits/chosen": -4.042386054992676,
"logits/rejected": -4.160777568817139,
"logps/chosen": -743.6102294921875,
"logps/rejected": -578.762939453125,
"loss": 0.6119,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.1839004009962082,
"rewards/margins": 0.23765355348587036,
"rewards/rejected": -0.05375315621495247,
"step": 5675
},
{
"epoch": 0.18713680685511672,
"grad_norm": 134.0,
"learning_rate": 4.19021831105094e-06,
"logits/chosen": -4.224156856536865,
"logits/rejected": -4.259792327880859,
"logps/chosen": -711.947998046875,
"logps/rejected": -538.19189453125,
"loss": 0.6145,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.22774983942508698,
"rewards/margins": 0.22662487626075745,
"rewards/rejected": 0.0011249757371842861,
"step": 5700
},
{
"epoch": 0.18795758232377952,
"grad_norm": 155.0,
"learning_rate": 4.185987476730412e-06,
"logits/chosen": -4.1899213790893555,
"logits/rejected": -4.298079967498779,
"logps/chosen": -790.821533203125,
"logps/rejected": -654.3777465820312,
"loss": 0.6277,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.13436265289783478,
"rewards/margins": 0.20195932686328888,
"rewards/rejected": -0.0675966665148735,
"step": 5725
},
{
"epoch": 0.1887783577924423,
"grad_norm": 210.0,
"learning_rate": 4.181756642409884e-06,
"logits/chosen": -4.21663236618042,
"logits/rejected": -4.2310566902160645,
"logps/chosen": -826.4243774414062,
"logps/rejected": -574.5404052734375,
"loss": 0.6265,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.14746469259262085,
"rewards/margins": 0.22628816962242126,
"rewards/rejected": -0.07882346212863922,
"step": 5750
},
{
"epoch": 0.18959913326110509,
"grad_norm": 149.0,
"learning_rate": 4.177525808089356e-06,
"logits/chosen": -4.072010040283203,
"logits/rejected": -4.1931257247924805,
"logps/chosen": -699.114501953125,
"logps/rejected": -505.4546203613281,
"loss": 0.5832,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.27559271454811096,
"rewards/margins": 0.302867591381073,
"rewards/rejected": -0.02727488987147808,
"step": 5775
},
{
"epoch": 0.19041990872976788,
"grad_norm": 108.0,
"learning_rate": 4.173294973768828e-06,
"logits/chosen": -4.217546463012695,
"logits/rejected": -4.097277641296387,
"logps/chosen": -739.192626953125,
"logps/rejected": -503.85296630859375,
"loss": 0.6646,
"rewards/accuracies": 0.5400000214576721,
"rewards/chosen": 0.1342460662126541,
"rewards/margins": 0.13998816907405853,
"rewards/rejected": -0.005742125678807497,
"step": 5800
},
{
"epoch": 0.19124068419843068,
"grad_norm": 88.0,
"learning_rate": 4.1690641394483e-06,
"logits/chosen": -4.054004669189453,
"logits/rejected": -4.206819534301758,
"logps/chosen": -619.5850219726562,
"logps/rejected": -404.0340576171875,
"loss": 0.612,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.1378345638513565,
"rewards/margins": 0.2363594025373459,
"rewards/rejected": -0.09852485358715057,
"step": 5825
},
{
"epoch": 0.19206145966709348,
"grad_norm": 302.0,
"learning_rate": 4.164833305127772e-06,
"logits/chosen": -4.187170505523682,
"logits/rejected": -4.270932674407959,
"logps/chosen": -751.2974853515625,
"logps/rejected": -424.2715759277344,
"loss": 0.6397,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.09154117852449417,
"rewards/margins": 0.16450461745262146,
"rewards/rejected": -0.07296343892812729,
"step": 5850
},
{
"epoch": 0.19288223513575625,
"grad_norm": 141.0,
"learning_rate": 4.160602470807243e-06,
"logits/chosen": -4.164458751678467,
"logits/rejected": -4.169947624206543,
"logps/chosen": -669.7885131835938,
"logps/rejected": -516.6512451171875,
"loss": 0.658,
"rewards/accuracies": 0.5199999809265137,
"rewards/chosen": 0.12925738096237183,
"rewards/margins": 0.11811258643865585,
"rewards/rejected": 0.011144790798425674,
"step": 5875
},
{
"epoch": 0.19370301060441905,
"grad_norm": 143.0,
"learning_rate": 4.156371636486716e-06,
"logits/chosen": -4.081195831298828,
"logits/rejected": -4.181396961212158,
"logps/chosen": -765.7492065429688,
"logps/rejected": -589.5405883789062,
"loss": 0.5968,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.18399088084697723,
"rewards/margins": 0.2514546513557434,
"rewards/rejected": -0.06746377795934677,
"step": 5900
},
{
"epoch": 0.19452378607308185,
"grad_norm": 159.0,
"learning_rate": 4.152140802166187e-06,
"logits/chosen": -4.238447666168213,
"logits/rejected": -4.124698638916016,
"logps/chosen": -767.7413940429688,
"logps/rejected": -659.5244750976562,
"loss": 0.6515,
"rewards/accuracies": 0.5600000023841858,
"rewards/chosen": 0.24860292673110962,
"rewards/margins": 0.1544342190027237,
"rewards/rejected": 0.09416870772838593,
"step": 5925
},
{
"epoch": 0.19534456154174465,
"grad_norm": 126.0,
"learning_rate": 4.14790996784566e-06,
"logits/chosen": -4.309987545013428,
"logits/rejected": -4.205745697021484,
"logps/chosen": -679.7860717773438,
"logps/rejected": -557.9547729492188,
"loss": 0.6518,
"rewards/accuracies": 0.6200000047683716,
"rewards/chosen": 0.19679342210292816,
"rewards/margins": 0.1271744966506958,
"rewards/rejected": 0.06961893290281296,
"step": 5950
},
{
"epoch": 0.19616533701040742,
"grad_norm": 252.0,
"learning_rate": 4.143679133525131e-06,
"logits/chosen": -4.0731916427612305,
"logits/rejected": -4.357572555541992,
"logps/chosen": -673.31787109375,
"logps/rejected": -456.28924560546875,
"loss": 0.6656,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.12726329267024994,
"rewards/margins": 0.1085977777838707,
"rewards/rejected": 0.018665514886379242,
"step": 5975
},
{
"epoch": 0.19698611247907022,
"grad_norm": 162.0,
"learning_rate": 4.139448299204604e-06,
"logits/chosen": -4.230855941772461,
"logits/rejected": -4.283547878265381,
"logps/chosen": -627.782470703125,
"logps/rejected": -425.7085266113281,
"loss": 0.6155,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.16935734450817108,
"rewards/margins": 0.20453746616840363,
"rewards/rejected": -0.03518013656139374,
"step": 6000
},
{
"epoch": 0.19780688794773302,
"grad_norm": 89.5,
"learning_rate": 4.135217464884075e-06,
"logits/chosen": -4.026022434234619,
"logits/rejected": -4.211236000061035,
"logps/chosen": -636.9410400390625,
"logps/rejected": -497.6493835449219,
"loss": 0.6299,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.20437130331993103,
"rewards/margins": 0.18679769337177277,
"rewards/rejected": 0.01757361926138401,
"step": 6025
},
{
"epoch": 0.19862766341639582,
"grad_norm": 81.5,
"learning_rate": 4.130986630563548e-06,
"logits/chosen": -4.12556266784668,
"logits/rejected": -4.181331157684326,
"logps/chosen": -663.214111328125,
"logps/rejected": -548.843505859375,
"loss": 0.6332,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.211769238114357,
"rewards/margins": 0.17643685638904572,
"rewards/rejected": 0.03533238545060158,
"step": 6050
},
{
"epoch": 0.19944843888505862,
"grad_norm": 149.0,
"learning_rate": 4.126755796243019e-06,
"logits/chosen": -4.021080493927002,
"logits/rejected": -4.118467807769775,
"logps/chosen": -780.01611328125,
"logps/rejected": -585.002685546875,
"loss": 0.6105,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.2011178731918335,
"rewards/margins": 0.20934166014194489,
"rewards/rejected": -0.008223775774240494,
"step": 6075
},
{
"epoch": 0.2002692143537214,
"grad_norm": 140.0,
"learning_rate": 4.122524961922492e-06,
"logits/chosen": -4.2615885734558105,
"logits/rejected": -4.098018646240234,
"logps/chosen": -739.8490600585938,
"logps/rejected": -538.7686157226562,
"loss": 0.6331,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.22424788773059845,
"rewards/margins": 0.1791938841342926,
"rewards/rejected": 0.045054011046886444,
"step": 6100
},
{
"epoch": 0.20108998982238419,
"grad_norm": 135.0,
"learning_rate": 4.118294127601963e-06,
"logits/chosen": -4.1815314292907715,
"logits/rejected": -4.297692775726318,
"logps/chosen": -692.421875,
"logps/rejected": -482.92730712890625,
"loss": 0.5629,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": 0.23259569704532623,
"rewards/margins": 0.3299049437046051,
"rewards/rejected": -0.09730925410985947,
"step": 6125
},
{
"epoch": 0.20191076529104698,
"grad_norm": 176.0,
"learning_rate": 4.114063293281436e-06,
"logits/chosen": -4.1495208740234375,
"logits/rejected": -4.088111877441406,
"logps/chosen": -725.1746215820312,
"logps/rejected": -486.193603515625,
"loss": 0.6098,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.21869854629039764,
"rewards/margins": 0.23580920696258545,
"rewards/rejected": -0.01711067371070385,
"step": 6150
},
{
"epoch": 0.20273154075970978,
"grad_norm": 182.0,
"learning_rate": 4.109832458960907e-06,
"logits/chosen": -4.105493068695068,
"logits/rejected": -4.2606730461120605,
"logps/chosen": -630.2760009765625,
"logps/rejected": -479.7104187011719,
"loss": 0.6216,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.18199868500232697,
"rewards/margins": 0.20924949645996094,
"rewards/rejected": -0.02725079283118248,
"step": 6175
},
{
"epoch": 0.20355231622837255,
"grad_norm": 118.0,
"learning_rate": 4.10560162464038e-06,
"logits/chosen": -4.221208095550537,
"logits/rejected": -4.3337202072143555,
"logps/chosen": -819.2532958984375,
"logps/rejected": -593.1384887695312,
"loss": 0.601,
"rewards/accuracies": 0.8199999928474426,
"rewards/chosen": 0.1887252926826477,
"rewards/margins": 0.24284206330776215,
"rewards/rejected": -0.05411674454808235,
"step": 6200
},
{
"epoch": 0.20437309169703535,
"grad_norm": 183.0,
"learning_rate": 4.101370790319851e-06,
"logits/chosen": -4.222593784332275,
"logits/rejected": -4.143655776977539,
"logps/chosen": -936.6873168945312,
"logps/rejected": -611.08740234375,
"loss": 0.6512,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.24604524672031403,
"rewards/margins": 0.1898384541273117,
"rewards/rejected": 0.05620681867003441,
"step": 6225
},
{
"epoch": 0.20519386716569815,
"grad_norm": 120.0,
"learning_rate": 4.097139955999324e-06,
"logits/chosen": -4.276827812194824,
"logits/rejected": -4.2462544441223145,
"logps/chosen": -786.3945922851562,
"logps/rejected": -569.2025756835938,
"loss": 0.6368,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.18757988512516022,
"rewards/margins": 0.16715353727340698,
"rewards/rejected": 0.020426325500011444,
"step": 6250
},
{
"epoch": 0.20601464263436095,
"grad_norm": 264.0,
"learning_rate": 4.092909121678795e-06,
"logits/chosen": -4.115857124328613,
"logits/rejected": -4.1909685134887695,
"logps/chosen": -653.240478515625,
"logps/rejected": -459.0234375,
"loss": 0.6534,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.09809459000825882,
"rewards/margins": 0.13492736220359802,
"rewards/rejected": -0.0368327796459198,
"step": 6275
},
{
"epoch": 0.20683541810302375,
"grad_norm": 206.0,
"learning_rate": 4.088678287358268e-06,
"logits/chosen": -4.155333995819092,
"logits/rejected": -4.186524868011475,
"logps/chosen": -736.306396484375,
"logps/rejected": -539.7884521484375,
"loss": 0.6026,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.18953242897987366,
"rewards/margins": 0.24376721680164337,
"rewards/rejected": -0.054234765470027924,
"step": 6300
},
{
"epoch": 0.20765619357168652,
"grad_norm": 237.0,
"learning_rate": 4.084447453037739e-06,
"logits/chosen": -4.154298782348633,
"logits/rejected": -4.270554065704346,
"logps/chosen": -870.3758544921875,
"logps/rejected": -743.3199462890625,
"loss": 0.6616,
"rewards/accuracies": 0.5799999833106995,
"rewards/chosen": 0.12642379105091095,
"rewards/margins": 0.09510830044746399,
"rewards/rejected": 0.03131549060344696,
"step": 6325
},
{
"epoch": 0.20847696904034932,
"grad_norm": 205.0,
"learning_rate": 4.080216618717212e-06,
"logits/chosen": -4.2590012550354,
"logits/rejected": -4.194936752319336,
"logps/chosen": -614.5784301757812,
"logps/rejected": -491.8901672363281,
"loss": 0.6684,
"rewards/accuracies": 0.6399999856948853,
"rewards/chosen": 0.13347198069095612,
"rewards/margins": 0.14738866686820984,
"rewards/rejected": -0.01391667127609253,
"step": 6350
},
{
"epoch": 0.20929774450901212,
"grad_norm": 221.0,
"learning_rate": 4.075985784396683e-06,
"logits/chosen": -4.210058212280273,
"logits/rejected": -4.1984453201293945,
"logps/chosen": -720.3945922851562,
"logps/rejected": -589.595947265625,
"loss": 0.6069,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": 0.22792109847068787,
"rewards/margins": 0.2285040020942688,
"rewards/rejected": -0.0005828905268572271,
"step": 6375
},
{
"epoch": 0.21011851997767492,
"grad_norm": 290.0,
"learning_rate": 4.071754950076156e-06,
"logits/chosen": -4.021488666534424,
"logits/rejected": -4.137681484222412,
"logps/chosen": -799.5679931640625,
"logps/rejected": -506.8511657714844,
"loss": 0.6242,
"rewards/accuracies": 0.6600000262260437,
"rewards/chosen": 0.154415562748909,
"rewards/margins": 0.17487049102783203,
"rewards/rejected": -0.020454909652471542,
"step": 6400
},
{
"epoch": 0.2109392954463377,
"grad_norm": 150.0,
"learning_rate": 4.067524115755627e-06,
"logits/chosen": -4.259695529937744,
"logits/rejected": -4.3616743087768555,
"logps/chosen": -781.1605224609375,
"logps/rejected": -477.47882080078125,
"loss": 0.5909,
"rewards/accuracies": 0.7400000095367432,
"rewards/chosen": 0.1878763735294342,
"rewards/margins": 0.2647455334663391,
"rewards/rejected": -0.07686912268400192,
"step": 6425
},
{
"epoch": 0.2117600709150005,
"grad_norm": 154.0,
"learning_rate": 4.0632932814351e-06,
"logits/chosen": -4.311985015869141,
"logits/rejected": -4.301010608673096,
"logps/chosen": -793.2703857421875,
"logps/rejected": -579.3934326171875,
"loss": 0.5878,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.21181993186473846,
"rewards/margins": 0.28632134199142456,
"rewards/rejected": -0.0745014175772667,
"step": 6450
},
{
"epoch": 0.21258084638366329,
"grad_norm": 217.0,
"learning_rate": 4.059062447114571e-06,
"logits/chosen": -4.188201904296875,
"logits/rejected": -4.189501762390137,
"logps/chosen": -824.2460327148438,
"logps/rejected": -638.919189453125,
"loss": 0.6032,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": 0.23749302327632904,
"rewards/margins": 0.26156920194625854,
"rewards/rejected": -0.024076232686638832,
"step": 6475
},
{
"epoch": 0.21340162185232608,
"grad_norm": 155.0,
"learning_rate": 4.054831612794044e-06,
"logits/chosen": -4.130152702331543,
"logits/rejected": -4.191186428070068,
"logps/chosen": -683.3399047851562,
"logps/rejected": -519.9230346679688,
"loss": 0.6279,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.22105403244495392,
"rewards/margins": 0.19117388129234314,
"rewards/rejected": 0.02988017164170742,
"step": 6500
}
],
"logging_steps": 25,
"max_steps": 30459,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}