Llama-3-Instruct-8B-SimPOW-0 / trainer_state.json
RAY2L's picture
Upload folder using huggingface_hub
a884417 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9981298423724285,
"eval_steps": 400,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021373230029388193,
"grad_norm": 23814117.69119963,
"learning_rate": 2.127659574468085e-08,
"logits/chosen": -1.1381689310073853,
"logits/rejected": -0.9913416504859924,
"logps/chosen": -0.2839311361312866,
"logps/rejected": -0.29555341601371765,
"loss": 305.9593,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7098277807235718,
"rewards/margins": 0.029055725783109665,
"rewards/rejected": -0.7388835549354553,
"step": 1
},
{
"epoch": 0.010686615014694095,
"grad_norm": 1974395.8030804002,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -0.9901005029678345,
"logits/rejected": -0.9188694953918457,
"logps/chosen": -0.26972177624702454,
"logps/rejected": -0.2686304748058319,
"loss": 266.3214,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.6743044853210449,
"rewards/margins": -0.002728263381868601,
"rewards/rejected": -0.6715761423110962,
"step": 5
},
{
"epoch": 0.02137323002938819,
"grad_norm": 46220091.26670953,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -0.9833618998527527,
"logits/rejected": -0.9393731951713562,
"logps/chosen": -0.27256160974502563,
"logps/rejected": -0.273215115070343,
"loss": 185.9952,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.6814040541648865,
"rewards/margins": 0.0016337722772732377,
"rewards/rejected": -0.6830377578735352,
"step": 10
},
{
"epoch": 0.03205984504408229,
"grad_norm": 474920.2122214309,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -0.963974118232727,
"logits/rejected": -0.9196063876152039,
"logps/chosen": -0.29573556780815125,
"logps/rejected": -0.28305521607398987,
"loss": 125.1317,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.7393389940261841,
"rewards/margins": -0.0317009761929512,
"rewards/rejected": -0.7076379656791687,
"step": 15
},
{
"epoch": 0.04274646005877638,
"grad_norm": 4515133.468491916,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -0.976075291633606,
"logits/rejected": -0.9759608507156372,
"logps/chosen": -0.2616123557090759,
"logps/rejected": -0.27002111077308655,
"loss": 127.9034,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.6540309190750122,
"rewards/margins": 0.02102179452776909,
"rewards/rejected": -0.6750527620315552,
"step": 20
},
{
"epoch": 0.053433075073470476,
"grad_norm": 5165268.674113416,
"learning_rate": 5.319148936170212e-07,
"logits/chosen": -1.0451716184616089,
"logits/rejected": -1.0216295719146729,
"logps/chosen": -0.28275421261787415,
"logps/rejected": -0.2863079905509949,
"loss": 161.215,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.7068854570388794,
"rewards/margins": 0.0088844895362854,
"rewards/rejected": -0.7157700657844543,
"step": 25
},
{
"epoch": 0.06411969008816458,
"grad_norm": 143372948.8120165,
"learning_rate": 6.382978723404255e-07,
"logits/chosen": -1.071578025817871,
"logits/rejected": -0.9856084585189819,
"logps/chosen": -0.2763022780418396,
"logps/rejected": -0.2745462656021118,
"loss": 388.8185,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.6907557249069214,
"rewards/margins": -0.0043900711461901665,
"rewards/rejected": -0.6863657236099243,
"step": 30
},
{
"epoch": 0.07480630510285867,
"grad_norm": 1184929.3556330686,
"learning_rate": 7.446808510638297e-07,
"logits/chosen": -1.01273512840271,
"logits/rejected": -0.9335028529167175,
"logps/chosen": -0.27808648347854614,
"logps/rejected": -0.29893654584884644,
"loss": 115.7746,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.695216178894043,
"rewards/margins": 0.05212521553039551,
"rewards/rejected": -0.7473413348197937,
"step": 35
},
{
"epoch": 0.08549292011755276,
"grad_norm": 1877904.3293607633,
"learning_rate": 8.51063829787234e-07,
"logits/chosen": -0.9277470707893372,
"logits/rejected": -0.9166946411132812,
"logps/chosen": -0.2787823975086212,
"logps/rejected": -0.2824743986129761,
"loss": 138.8757,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.6969559788703918,
"rewards/margins": 0.009230067022144794,
"rewards/rejected": -0.7061859965324402,
"step": 40
},
{
"epoch": 0.09617953513224686,
"grad_norm": 2671724.397751134,
"learning_rate": 9.574468085106384e-07,
"logits/chosen": -0.9359474182128906,
"logits/rejected": -0.8535245060920715,
"logps/chosen": -0.33036336302757263,
"logps/rejected": -0.33015647530555725,
"loss": 104.7933,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.8259084820747375,
"rewards/margins": -0.0005173005047254264,
"rewards/rejected": -0.8253911733627319,
"step": 45
},
{
"epoch": 0.10686615014694095,
"grad_norm": 79521388.74298675,
"learning_rate": 9.998741174712533e-07,
"logits/chosen": -0.9259702563285828,
"logits/rejected": -0.9349774122238159,
"logps/chosen": -0.2925248146057129,
"logps/rejected": -0.3076633810997009,
"loss": 175.2819,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.731312096118927,
"rewards/margins": 0.0378464013338089,
"rewards/rejected": -0.7691584825515747,
"step": 50
},
{
"epoch": 0.11755276516163506,
"grad_norm": 327828.43192551495,
"learning_rate": 9.991050648838675e-07,
"logits/chosen": -0.9278720021247864,
"logits/rejected": -0.8686744570732117,
"logps/chosen": -0.2634710669517517,
"logps/rejected": -0.27794915437698364,
"loss": 2695.6354,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6586776971817017,
"rewards/margins": 0.03619522601366043,
"rewards/rejected": -0.6948728561401367,
"step": 55
},
{
"epoch": 0.12823938017632916,
"grad_norm": 899122.206242127,
"learning_rate": 9.97637968732563e-07,
"logits/chosen": -0.9561047554016113,
"logits/rejected": -0.9336016774177551,
"logps/chosen": -0.2656118869781494,
"logps/rejected": -0.28187674283981323,
"loss": 105.9757,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6640297174453735,
"rewards/margins": 0.0406620129942894,
"rewards/rejected": -0.7046917080879211,
"step": 60
},
{
"epoch": 0.13892599519102325,
"grad_norm": 377242.1086806779,
"learning_rate": 9.954748808839674e-07,
"logits/chosen": -0.956866443157196,
"logits/rejected": -1.005385398864746,
"logps/chosen": -0.2731708288192749,
"logps/rejected": -0.26419904828071594,
"loss": 108.4874,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.682927131652832,
"rewards/margins": -0.02242954447865486,
"rewards/rejected": -0.6604975461959839,
"step": 65
},
{
"epoch": 0.14961261020571734,
"grad_norm": 42496.77007169402,
"learning_rate": 9.926188266120295e-07,
"logits/chosen": -0.9903133511543274,
"logits/rejected": -0.9588413238525391,
"logps/chosen": -0.305401474237442,
"logps/rejected": -0.298237681388855,
"loss": 405.7784,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.7635036706924438,
"rewards/margins": -0.017909497022628784,
"rewards/rejected": -0.7455942034721375,
"step": 70
},
{
"epoch": 0.16029922522041143,
"grad_norm": 523140.8331781128,
"learning_rate": 9.890738003669027e-07,
"logits/chosen": -0.9634426236152649,
"logits/rejected": -0.9494821429252625,
"logps/chosen": -0.2741475999355316,
"logps/rejected": -0.2895483672618866,
"loss": 2624.7643,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.685369074344635,
"rewards/margins": 0.038501907140016556,
"rewards/rejected": -0.7238709926605225,
"step": 75
},
{
"epoch": 0.17098584023510552,
"grad_norm": 37571862.32606961,
"learning_rate": 9.848447601883433e-07,
"logits/chosen": -1.0007914304733276,
"logits/rejected": -0.9825354814529419,
"logps/chosen": -0.28798869252204895,
"logps/rejected": -0.28025856614112854,
"loss": 131.06,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.7199716567993164,
"rewards/margins": -0.019325237721204758,
"rewards/rejected": -0.7006464600563049,
"step": 80
},
{
"epoch": 0.18167245524979964,
"grad_norm": 8861430.225925114,
"learning_rate": 9.799376207714444e-07,
"logits/chosen": -0.9258670806884766,
"logits/rejected": -0.8755356073379517,
"logps/chosen": -0.2675308287143707,
"logps/rejected": -0.28247857093811035,
"loss": 151.0586,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.6688271760940552,
"rewards/margins": 0.03736928477883339,
"rewards/rejected": -0.7061963081359863,
"step": 85
},
{
"epoch": 0.19235907026449373,
"grad_norm": 322999.5648039634,
"learning_rate": 9.743592451943998e-07,
"logits/chosen": -0.8945444226264954,
"logits/rejected": -0.8326283693313599,
"logps/chosen": -0.2888963222503662,
"logps/rejected": -0.30566543340682983,
"loss": 104.0617,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7222408056259155,
"rewards/margins": 0.04192278906702995,
"rewards/rejected": -0.764163613319397,
"step": 90
},
{
"epoch": 0.20304568527918782,
"grad_norm": 16202558.227455074,
"learning_rate": 9.681174353198686e-07,
"logits/chosen": -1.0018240213394165,
"logits/rejected": -1.024642825126648,
"logps/chosen": -0.2775546908378601,
"logps/rejected": -0.31214436888694763,
"loss": 185.9518,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.6938868165016174,
"rewards/margins": 0.08647419512271881,
"rewards/rejected": -0.7803609371185303,
"step": 95
},
{
"epoch": 0.2137323002938819,
"grad_norm": 881275.6537233666,
"learning_rate": 9.612209208833646e-07,
"logits/chosen": -1.0630947351455688,
"logits/rejected": -1.0079935789108276,
"logps/chosen": -0.28541457653045654,
"logps/rejected": -0.27989768981933594,
"loss": 377.6484,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.7135364413261414,
"rewards/margins": -0.01379220187664032,
"rewards/rejected": -0.6997443437576294,
"step": 100
},
{
"epoch": 0.224418915308576,
"grad_norm": 1471606.598734741,
"learning_rate": 9.536793472839324e-07,
"logits/chosen": -0.9882336854934692,
"logits/rejected": -0.9416030645370483,
"logps/chosen": -0.2841026186943054,
"logps/rejected": -0.30027633905410767,
"loss": 205.1062,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.7102565169334412,
"rewards/margins": 0.0404343381524086,
"rewards/rejected": -0.7506908178329468,
"step": 105
},
{
"epoch": 0.2351055303232701,
"grad_norm": 313885.99955306284,
"learning_rate": 9.455032620941839e-07,
"logits/chosen": -0.9041908383369446,
"logits/rejected": -0.854825496673584,
"logps/chosen": -0.33143380284309387,
"logps/rejected": -0.3396168053150177,
"loss": 2658.6553,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.8285845518112183,
"rewards/margins": 0.020457318052649498,
"rewards/rejected": -0.8490419387817383,
"step": 110
},
{
"epoch": 0.2457921453379642,
"grad_norm": 682808.4048805884,
"learning_rate": 9.367041003085648e-07,
"logits/chosen": -0.9211395978927612,
"logits/rejected": -0.9187518358230591,
"logps/chosen": -0.2765989303588867,
"logps/rejected": -0.2790865898132324,
"loss": 2651.5432,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6914974451065063,
"rewards/margins": 0.006219107657670975,
"rewards/rejected": -0.697716474533081,
"step": 115
},
{
"epoch": 0.2564787603526583,
"grad_norm": 527643.1252709947,
"learning_rate": 9.272941683504808e-07,
"logits/chosen": -1.0119305849075317,
"logits/rejected": -0.9751386642456055,
"logps/chosen": -0.28785568475723267,
"logps/rejected": -0.31191155314445496,
"loss": 232.3885,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7196391820907593,
"rewards/margins": 0.06013970449566841,
"rewards/rejected": -0.7797788381576538,
"step": 120
},
{
"epoch": 0.2671653753673524,
"grad_norm": 57160137.92819305,
"learning_rate": 9.172866268606513e-07,
"logits/chosen": -1.0446767807006836,
"logits/rejected": -1.03428316116333,
"logps/chosen": -0.3408173620700836,
"logps/rejected": -0.33484262228012085,
"loss": 171.7718,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8520433306694031,
"rewards/margins": -0.014936879277229309,
"rewards/rejected": -0.8371064066886902,
"step": 125
},
{
"epoch": 0.2778519903820465,
"grad_norm": 354260.60461613233,
"learning_rate": 9.066954722907638e-07,
"logits/chosen": -1.0202778577804565,
"logits/rejected": -1.040438175201416,
"logps/chosen": -0.3069685399532318,
"logps/rejected": -0.3384125232696533,
"loss": 152.887,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.7674213647842407,
"rewards/margins": 0.07861001789569855,
"rewards/rejected": -0.8460313677787781,
"step": 130
},
{
"epoch": 0.2885386053967406,
"grad_norm": 4161760.6479958617,
"learning_rate": 8.955355173281707e-07,
"logits/chosen": -0.9897885322570801,
"logits/rejected": -0.9403419494628906,
"logps/chosen": -0.3388122022151947,
"logps/rejected": -0.29513686895370483,
"loss": 127.9212,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8470304608345032,
"rewards/margins": -0.10918829590082169,
"rewards/rejected": -0.7378422021865845,
"step": 135
},
{
"epoch": 0.2992252204114347,
"grad_norm": 1984829.9018358907,
"learning_rate": 8.838223701790055e-07,
"logits/chosen": -0.9738261103630066,
"logits/rejected": -0.9614647030830383,
"logps/chosen": -0.3720606565475464,
"logps/rejected": -0.3473301827907562,
"loss": 110.1865,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.930151641368866,
"rewards/margins": -0.06182613968849182,
"rewards/rejected": -0.8683255314826965,
"step": 140
},
{
"epoch": 0.30991183542612877,
"grad_norm": 1513820.1920679864,
"learning_rate": 8.71572412738697e-07,
"logits/chosen": -1.0015193223953247,
"logits/rejected": -0.9965044856071472,
"logps/chosen": -0.3061389625072479,
"logps/rejected": -0.3093434274196625,
"loss": 3025.9289,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7653473615646362,
"rewards/margins": 0.008011135272681713,
"rewards/rejected": -0.7733586430549622,
"step": 145
},
{
"epoch": 0.32059845044082286,
"grad_norm": 1970591.2689892622,
"learning_rate": 8.588027776804058e-07,
"logits/chosen": -0.9444905519485474,
"logits/rejected": -0.9439139366149902,
"logps/chosen": -0.3537100851535797,
"logps/rejected": -0.35441845655441284,
"loss": 221.1807,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8842751383781433,
"rewards/margins": 0.0017710126703605056,
"rewards/rejected": -0.8860462307929993,
"step": 150
},
{
"epoch": 0.33128506545551695,
"grad_norm": 12808984.8595381,
"learning_rate": 8.455313244934324e-07,
"logits/chosen": -1.0120588541030884,
"logits/rejected": -0.9681800603866577,
"logps/chosen": -0.3124849796295166,
"logps/rejected": -0.3563632667064667,
"loss": 187.1941,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7812124490737915,
"rewards/margins": 0.10969575494527817,
"rewards/rejected": -0.8909081220626831,
"step": 155
},
{
"epoch": 0.34197168047021104,
"grad_norm": 1183315.8265051153,
"learning_rate": 8.317766145051057e-07,
"logits/chosen": -1.0060454607009888,
"logits/rejected": -0.9961159825325012,
"logps/chosen": -0.2914329171180725,
"logps/rejected": -0.3657309412956238,
"loss": 120.0758,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.7285822629928589,
"rewards/margins": 0.1857450008392334,
"rewards/rejected": -0.9143272638320923,
"step": 160
},
{
"epoch": 0.3526582954849052,
"grad_norm": 2641730.3542562006,
"learning_rate": 8.175578849210894e-07,
"logits/chosen": -1.0412616729736328,
"logits/rejected": -0.9751707315444946,
"logps/chosen": -0.33477407693862915,
"logps/rejected": -0.31863099336624146,
"loss": 222.9428,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8369352221488953,
"rewards/margins": -0.04035765677690506,
"rewards/rejected": -0.7965775728225708,
"step": 165
},
{
"epoch": 0.36334491049959927,
"grad_norm": 24200663.77841142,
"learning_rate": 8.028950219204099e-07,
"logits/chosen": -1.0198957920074463,
"logits/rejected": -1.0187537670135498,
"logps/chosen": -0.35278764367103577,
"logps/rejected": -0.36079707741737366,
"loss": 175.3017,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8819690942764282,
"rewards/margins": 0.020023606717586517,
"rewards/rejected": -0.901992678642273,
"step": 170
},
{
"epoch": 0.37403152551429336,
"grad_norm": 423144.2753567647,
"learning_rate": 7.878085328428368e-07,
"logits/chosen": -1.0390782356262207,
"logits/rejected": -1.0463870763778687,
"logps/chosen": -0.3290930390357971,
"logps/rejected": -0.33625391125679016,
"loss": 172.6517,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8227324485778809,
"rewards/margins": 0.01790226623415947,
"rewards/rejected": -0.840634822845459,
"step": 175
},
{
"epoch": 0.38471814052898745,
"grad_norm": 66473449.795217186,
"learning_rate": 7.723195175075135e-07,
"logits/chosen": -1.074244737625122,
"logits/rejected": -1.0753021240234375,
"logps/chosen": -0.33172592520713806,
"logps/rejected": -0.34797996282577515,
"loss": 173.7831,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8293148279190063,
"rewards/margins": 0.04063502699136734,
"rewards/rejected": -0.8699499368667603,
"step": 180
},
{
"epoch": 0.39540475554368154,
"grad_norm": 2508265.2610814595,
"learning_rate": 7.564496387029531e-07,
"logits/chosen": -1.105753779411316,
"logits/rejected": -1.0690752267837524,
"logps/chosen": -0.33374324440956116,
"logps/rejected": -0.3317410349845886,
"loss": 130.1423,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8343580961227417,
"rewards/margins": -0.005005507729947567,
"rewards/rejected": -0.8293525576591492,
"step": 185
},
{
"epoch": 0.40609137055837563,
"grad_norm": 1415188.1564555708,
"learning_rate": 7.402210918896689e-07,
"logits/chosen": -1.1109493970870972,
"logits/rejected": -1.0873216390609741,
"logps/chosen": -0.3215797245502472,
"logps/rejected": -0.34585997462272644,
"loss": 103.3292,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8039493560791016,
"rewards/margins": 0.06070064380764961,
"rewards/rejected": -0.8646499514579773,
"step": 190
},
{
"epoch": 0.4167779855730697,
"grad_norm": 22405463.240061384,
"learning_rate": 7.236565741578162e-07,
"logits/chosen": -1.0102512836456299,
"logits/rejected": -0.995439350605011,
"logps/chosen": -0.34978950023651123,
"logps/rejected": -0.3523608446121216,
"loss": 144.2631,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.8744736909866333,
"rewards/margins": 0.006428359542042017,
"rewards/rejected": -0.880902111530304,
"step": 195
},
{
"epoch": 0.4274646005877638,
"grad_norm": 364076.0574983151,
"learning_rate": 7.067792524832603e-07,
"logits/chosen": -1.0113328695297241,
"logits/rejected": -1.0017603635787964,
"logps/chosen": -0.3367648124694824,
"logps/rejected": -0.334301233291626,
"loss": 115.4949,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.8419120907783508,
"rewards/margins": -0.006158898584544659,
"rewards/rejected": -0.8357530832290649,
"step": 200
},
{
"epoch": 0.4381512156024579,
"grad_norm": 1540771.9395518457,
"learning_rate": 6.896127313264642e-07,
"logits/chosen": -1.1025888919830322,
"logits/rejected": -1.0531136989593506,
"logps/chosen": -0.32284116744995117,
"logps/rejected": -0.34286996722221375,
"loss": 117.5218,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.8071029782295227,
"rewards/margins": 0.05007190629839897,
"rewards/rejected": -0.8571747541427612,
"step": 205
},
{
"epoch": 0.448837830617152,
"grad_norm": 197109620.3195932,
"learning_rate": 6.721810196195174e-07,
"logits/chosen": -1.1090078353881836,
"logits/rejected": -1.0331655740737915,
"logps/chosen": -0.2971234619617462,
"logps/rejected": -0.3048322796821594,
"loss": 199.9444,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.7428085207939148,
"rewards/margins": 0.01927214488387108,
"rewards/rejected": -0.7620807886123657,
"step": 210
},
{
"epoch": 0.45952444563184613,
"grad_norm": 210941.10682300097,
"learning_rate": 6.545084971874736e-07,
"logits/chosen": -1.055289626121521,
"logits/rejected": -0.9978870153427124,
"logps/chosen": -0.31941694021224976,
"logps/rejected": -0.3496856093406677,
"loss": 117.6536,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7985422015190125,
"rewards/margins": 0.07567177712917328,
"rewards/rejected": -0.8742140531539917,
"step": 215
},
{
"epoch": 0.4702110606465402,
"grad_norm": 17774561.440666944,
"learning_rate": 6.3661988065096e-07,
"logits/chosen": -1.0686638355255127,
"logits/rejected": -1.0178725719451904,
"logps/chosen": -0.33966127038002014,
"logps/rejected": -0.3513311445713043,
"loss": 109.0363,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8491531610488892,
"rewards/margins": 0.029174691066145897,
"rewards/rejected": -0.8783278465270996,
"step": 220
},
{
"epoch": 0.4808976756612343,
"grad_norm": 1448517.5044393009,
"learning_rate": 6.185401888577487e-07,
"logits/chosen": -1.0401103496551514,
"logits/rejected": -1.0147794485092163,
"logps/chosen": -0.284532368183136,
"logps/rejected": -0.2862989902496338,
"loss": 111.407,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.7113308906555176,
"rewards/margins": 0.004416565410792828,
"rewards/rejected": -0.7157474756240845,
"step": 225
},
{
"epoch": 0.4915842906759284,
"grad_norm": 27408086.19895333,
"learning_rate": 6.002947078916364e-07,
"logits/chosen": -1.1030638217926025,
"logits/rejected": -0.9990617632865906,
"logps/chosen": -0.33675864338874817,
"logps/rejected": -0.3289005756378174,
"loss": 2706.1715,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.841896653175354,
"rewards/margins": -0.01964510791003704,
"rewards/rejected": -0.8222514986991882,
"step": 230
},
{
"epoch": 0.5022709056906225,
"grad_norm": 317843.4109202252,
"learning_rate": 5.819089557075688e-07,
"logits/chosen": -1.0890393257141113,
"logits/rejected": -1.0440254211425781,
"logps/chosen": -0.34919267892837524,
"logps/rejected": -0.3513543903827667,
"loss": 238.4326,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8729816675186157,
"rewards/margins": 0.005404374096542597,
"rewards/rejected": -0.8783860206604004,
"step": 235
},
{
"epoch": 0.5129575207053166,
"grad_norm": 85019.6413638533,
"learning_rate": 5.634086464424742e-07,
"logits/chosen": -1.1137946844100952,
"logits/rejected": -1.076812505722046,
"logps/chosen": -0.32847946882247925,
"logps/rejected": -0.322710782289505,
"loss": 114.2576,
"rewards/accuracies": 0.38749998807907104,
"rewards/chosen": -0.8211986422538757,
"rewards/margins": -0.014421721920371056,
"rewards/rejected": -0.8067768812179565,
"step": 240
},
{
"epoch": 0.5236441357200107,
"grad_norm": 6299240.513263524,
"learning_rate": 5.448196544517167e-07,
"logits/chosen": -1.1339685916900635,
"logits/rejected": -1.0941574573516846,
"logps/chosen": -0.3057493269443512,
"logps/rejected": -0.3173142373561859,
"loss": 145.971,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.7643733024597168,
"rewards/margins": 0.028912359848618507,
"rewards/rejected": -0.7932857275009155,
"step": 245
},
{
"epoch": 0.5343307507347048,
"grad_norm": 847488.7365746452,
"learning_rate": 5.26167978121472e-07,
"logits/chosen": -1.1527339220046997,
"logits/rejected": -1.114386796951294,
"logps/chosen": -0.3197785019874573,
"logps/rejected": -0.3439500629901886,
"loss": 2684.4785,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.7994462251663208,
"rewards/margins": 0.060428936034440994,
"rewards/rejected": -0.8598750829696655,
"step": 250
},
{
"epoch": 0.5450173657493989,
"grad_norm": 231777.21031654373,
"learning_rate": 5.074797035076318e-07,
"logits/chosen": -1.0928058624267578,
"logits/rejected": -1.079099416732788,
"logps/chosen": -0.3362935483455658,
"logps/rejected": -0.36446088552474976,
"loss": 94.7642,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.8407338261604309,
"rewards/margins": 0.0704183503985405,
"rewards/rejected": -0.911152184009552,
"step": 255
},
{
"epoch": 0.555703980764093,
"grad_norm": 147041.27143323392,
"learning_rate": 4.887809678520975e-07,
"logits/chosen": -1.0703433752059937,
"logits/rejected": -1.0498476028442383,
"logps/chosen": -0.29196763038635254,
"logps/rejected": -0.3219326138496399,
"loss": 84.8449,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.7299190759658813,
"rewards/margins": 0.07491237670183182,
"rewards/rejected": -0.8048315048217773,
"step": 260
},
{
"epoch": 0.566390595778787,
"grad_norm": 435566096526.86523,
"learning_rate": 4.700979230274829e-07,
"logits/chosen": -1.0955275297164917,
"logits/rejected": -1.0940407514572144,
"logps/chosen": -0.3326551914215088,
"logps/rejected": -0.35088759660720825,
"loss": 887.0658,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.831637978553772,
"rewards/margins": 0.04558102414011955,
"rewards/rejected": -0.877219021320343,
"step": 265
},
{
"epoch": 0.5770772107934812,
"grad_norm": 3520938.773348992,
"learning_rate": 4.514566989613559e-07,
"logits/chosen": -1.086938500404358,
"logits/rejected": -1.0884182453155518,
"logps/chosen": -0.3024354875087738,
"logps/rejected": -0.29867392778396606,
"loss": 228.8071,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.7560887336730957,
"rewards/margins": -0.009403971955180168,
"rewards/rejected": -0.7466848492622375,
"step": 270
},
{
"epoch": 0.5877638258081752,
"grad_norm": 62256.73172967315,
"learning_rate": 4.328833670911724e-07,
"logits/chosen": -0.9738815426826477,
"logits/rejected": -0.9222286343574524,
"logps/chosen": -0.2884615659713745,
"logps/rejected": -0.3087163269519806,
"loss": 1419.8171,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.721153974533081,
"rewards/margins": 0.05063692852854729,
"rewards/rejected": -0.7717908620834351,
"step": 275
},
{
"epoch": 0.5984504408228694,
"grad_norm": 803294.007310266,
"learning_rate": 4.144039039010124e-07,
"logits/chosen": -1.1537045240402222,
"logits/rejected": -1.0968632698059082,
"logps/chosen": -0.33216923475265503,
"logps/rejected": -0.2991081774234772,
"loss": 301.7224,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8304230570793152,
"rewards/margins": -0.08265267312526703,
"rewards/rejected": -0.7477704286575317,
"step": 280
},
{
"epoch": 0.6091370558375635,
"grad_norm": 263927.3164272957,
"learning_rate": 3.960441545911204e-07,
"logits/chosen": -1.044985055923462,
"logits/rejected": -1.0020415782928467,
"logps/chosen": -0.3105274736881256,
"logps/rejected": -0.3335118591785431,
"loss": 104.0724,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.7763187289237976,
"rewards/margins": 0.057460904121398926,
"rewards/rejected": -0.8337796330451965,
"step": 285
},
{
"epoch": 0.6198236708522575,
"grad_norm": 4071370.4437897406,
"learning_rate": 3.778297969310529e-07,
"logits/chosen": -1.071925163269043,
"logits/rejected": -1.0407798290252686,
"logps/chosen": -0.3102174699306488,
"logps/rejected": -0.33250361680984497,
"loss": 156.4641,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.7755436897277832,
"rewards/margins": 0.055715300142765045,
"rewards/rejected": -0.8312589526176453,
"step": 290
},
{
"epoch": 0.6305102858669517,
"grad_norm": 965178.0635991972,
"learning_rate": 3.5978630534699865e-07,
"logits/chosen": -1.1488522291183472,
"logits/rejected": -1.1228643655776978,
"logps/chosen": -0.3139174282550812,
"logps/rejected": -0.32147642970085144,
"loss": 134.6723,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.784793496131897,
"rewards/margins": 0.01889752224087715,
"rewards/rejected": -0.8036910891532898,
"step": 295
},
{
"epoch": 0.6411969008816457,
"grad_norm": 20365408.82604466,
"learning_rate": 3.4193891529348795e-07,
"logits/chosen": -1.0468319654464722,
"logits/rejected": -1.0043448209762573,
"logps/chosen": -0.3311859369277954,
"logps/rejected": -0.3362448513507843,
"loss": 120.3211,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8279649019241333,
"rewards/margins": 0.012647300958633423,
"rewards/rejected": -0.8406121134757996,
"step": 300
},
{
"epoch": 0.6518835158963399,
"grad_norm": 68905.1696103493,
"learning_rate": 3.243125879593286e-07,
"logits/chosen": -1.1107118129730225,
"logits/rejected": -1.0741993188858032,
"logps/chosen": -0.3406161665916443,
"logps/rejected": -0.32983919978141785,
"loss": 118.2299,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8515404462814331,
"rewards/margins": -0.026942413300275803,
"rewards/rejected": -0.8245980143547058,
"step": 305
},
{
"epoch": 0.6625701309110339,
"grad_norm": 3378905.921798347,
"learning_rate": 3.069319753571269e-07,
"logits/chosen": -1.069124460220337,
"logits/rejected": -1.110024094581604,
"logps/chosen": -0.3401089012622833,
"logps/rejected": -0.37533271312713623,
"loss": 151.3316,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8502721786499023,
"rewards/margins": 0.08805962651968002,
"rewards/rejected": -0.9383317828178406,
"step": 310
},
{
"epoch": 0.673256745925728,
"grad_norm": 27112706002.48435,
"learning_rate": 2.898213858452173e-07,
"logits/chosen": -1.2464616298675537,
"logits/rejected": -1.1925503015518188,
"logps/chosen": -0.34163057804107666,
"logps/rejected": -0.34519410133361816,
"loss": 951.2437,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.8540765047073364,
"rewards/margins": 0.00890885479748249,
"rewards/rejected": -0.8629853129386902,
"step": 315
},
{
"epoch": 0.6839433609404221,
"grad_norm": 34759765.70829812,
"learning_rate": 2.730047501302266e-07,
"logits/chosen": -1.1456632614135742,
"logits/rejected": -1.1187629699707031,
"logps/chosen": -0.3103678226470947,
"logps/rejected": -0.35759711265563965,
"loss": 140.3946,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7759194374084473,
"rewards/margins": 0.11807328462600708,
"rewards/rejected": -0.8939927816390991,
"step": 320
},
{
"epoch": 0.6946299759551162,
"grad_norm": 250533.47771917153,
"learning_rate": 2.5650558779781635e-07,
"logits/chosen": -1.0254387855529785,
"logits/rejected": -1.0484802722930908,
"logps/chosen": -0.3333364427089691,
"logps/rejected": -0.3406026363372803,
"loss": 110.2831,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.8333410024642944,
"rewards/margins": 0.01816548779606819,
"rewards/rejected": -0.8515065312385559,
"step": 325
},
{
"epoch": 0.7053165909698104,
"grad_norm": 112400196.05235167,
"learning_rate": 2.403469744184154e-07,
"logits/chosen": -1.15934157371521,
"logits/rejected": -1.1072492599487305,
"logps/chosen": -0.3412119746208191,
"logps/rejected": -0.37822234630584717,
"loss": 171.9244,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8530298471450806,
"rewards/margins": 0.09252593666315079,
"rewards/rejected": -0.9455558061599731,
"step": 330
},
{
"epoch": 0.7160032059845044,
"grad_norm": 9446757.557474248,
"learning_rate": 2.2455150927394878e-07,
"logits/chosen": -1.1374088525772095,
"logits/rejected": -1.1560612916946411,
"logps/chosen": -0.3190115988254547,
"logps/rejected": -0.34075072407722473,
"loss": 93.1427,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7975289225578308,
"rewards/margins": 0.05434788018465042,
"rewards/rejected": -0.8518768548965454,
"step": 335
},
{
"epoch": 0.7266898209991985,
"grad_norm": 16695168.934409656,
"learning_rate": 2.0914128375069722e-07,
"logits/chosen": -1.0877724885940552,
"logits/rejected": -1.0620936155319214,
"logps/chosen": -0.30620652437210083,
"logps/rejected": -0.33592310547828674,
"loss": 120.7583,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7655162811279297,
"rewards/margins": 0.07429146766662598,
"rewards/rejected": -0.8398076891899109,
"step": 340
},
{
"epoch": 0.7373764360138926,
"grad_norm": 4144624.6729695094,
"learning_rate": 1.9413785044249676e-07,
"logits/chosen": -1.0486472845077515,
"logits/rejected": -1.0094027519226074,
"logps/chosen": -0.30639034509658813,
"logps/rejected": -0.321176141500473,
"loss": 164.1478,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.765975832939148,
"rewards/margins": 0.03696460276842117,
"rewards/rejected": -0.8029405474662781,
"step": 345
},
{
"epoch": 0.7480630510285867,
"grad_norm": 23390472.719695035,
"learning_rate": 1.7956219300748792e-07,
"logits/chosen": -1.1522271633148193,
"logits/rejected": -1.1415410041809082,
"logps/chosen": -0.34453052282333374,
"logps/rejected": -0.3794510066509247,
"loss": 555.5947,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.8613262176513672,
"rewards/margins": 0.08730128407478333,
"rewards/rejected": -0.9486274719238281,
"step": 350
},
{
"epoch": 0.7587496660432808,
"grad_norm": 1280169.965933302,
"learning_rate": 1.6543469682057104e-07,
"logits/chosen": -1.1356937885284424,
"logits/rejected": -1.0751426219940186,
"logps/chosen": -0.3311988413333893,
"logps/rejected": -0.3161237835884094,
"loss": 82.217,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.827997088432312,
"rewards/margins": -0.037687692791223526,
"rewards/rejected": -0.7903094291687012,
"step": 355
},
{
"epoch": 0.7694362810579749,
"grad_norm": 1956574.339469866,
"learning_rate": 1.5177512046261666e-07,
"logits/chosen": -1.1374176740646362,
"logits/rejected": -1.164374589920044,
"logps/chosen": -0.3486614227294922,
"logps/rejected": -0.3802019953727722,
"loss": 102.3601,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.8716535568237305,
"rewards/margins": 0.07885149866342545,
"rewards/rejected": -0.9505050778388977,
"step": 360
},
{
"epoch": 0.7801228960726689,
"grad_norm": 84129.78530171402,
"learning_rate": 1.3860256808630427e-07,
"logits/chosen": -1.137064814567566,
"logits/rejected": -1.0832656621932983,
"logps/chosen": -0.32139506936073303,
"logps/rejected": -0.3225245177745819,
"loss": 117.2444,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8034876585006714,
"rewards/margins": 0.0028236303478479385,
"rewards/rejected": -0.806311309337616,
"step": 365
},
{
"epoch": 0.7908095110873631,
"grad_norm": 4136765.760326951,
"learning_rate": 1.2593546269723647e-07,
"logits/chosen": -1.1603561639785767,
"logits/rejected": -1.1292134523391724,
"logps/chosen": -0.33979296684265137,
"logps/rejected": -0.35157322883605957,
"loss": 106.6815,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.8494824171066284,
"rewards/margins": 0.029450654983520508,
"rewards/rejected": -0.8789331316947937,
"step": 370
},
{
"epoch": 0.8014961261020572,
"grad_norm": 14999761.486437708,
"learning_rate": 1.1379152038770029e-07,
"logits/chosen": -1.1542747020721436,
"logits/rejected": -1.172849416732788,
"logps/chosen": -0.31457456946372986,
"logps/rejected": -0.369545042514801,
"loss": 120.5736,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.7864364385604858,
"rewards/margins": 0.1374262273311615,
"rewards/rejected": -0.923862636089325,
"step": 375
},
{
"epoch": 0.8121827411167513,
"grad_norm": 244556.56746403236,
"learning_rate": 1.0218772555910954e-07,
"logits/chosen": -1.2167600393295288,
"logits/rejected": -1.1646716594696045,
"logps/chosen": -0.37414881587028503,
"logps/rejected": -0.3576270043849945,
"loss": 113.0907,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9353721737861633,
"rewards/margins": -0.041304655373096466,
"rewards/rejected": -0.8940675854682922,
"step": 380
},
{
"epoch": 0.8228693561314454,
"grad_norm": 1775127.4439502584,
"learning_rate": 9.114030716778432e-08,
"logits/chosen": -1.1325044631958008,
"logits/rejected": -1.110126256942749,
"logps/chosen": -0.3134748637676239,
"logps/rejected": -0.35259318351745605,
"loss": 114.6353,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.7836871147155762,
"rewards/margins": 0.09779568761587143,
"rewards/rejected": -0.8814828991889954,
"step": 385
},
{
"epoch": 0.8335559711461394,
"grad_norm": 138946.22290507445,
"learning_rate": 8.066471602728803e-08,
"logits/chosen": -1.220655083656311,
"logits/rejected": -1.1926963329315186,
"logps/chosen": -0.33643871545791626,
"logps/rejected": -0.3400726318359375,
"loss": 205.2888,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8410967588424683,
"rewards/margins": 0.009084770455956459,
"rewards/rejected": -0.8501815795898438,
"step": 390
},
{
"epoch": 0.8442425861608336,
"grad_norm": 1993392.8238311838,
"learning_rate": 7.077560319906694e-08,
"logits/chosen": -1.0875790119171143,
"logits/rejected": -1.0107152462005615,
"logps/chosen": -0.3748469948768616,
"logps/rejected": -0.380262553691864,
"loss": 150.502,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.9371173977851868,
"rewards/margins": 0.013538897037506104,
"rewards/rejected": -0.950656533241272,
"step": 395
},
{
"epoch": 0.8549292011755276,
"grad_norm": 42068510.12370043,
"learning_rate": 6.148679950161672e-08,
"logits/chosen": -1.1116609573364258,
"logits/rejected": -1.0628454685211182,
"logps/chosen": -0.36634019017219543,
"logps/rejected": -0.35744190216064453,
"loss": 196.6313,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9158504605293274,
"rewards/margins": -0.022245775908231735,
"rewards/rejected": -0.8936047554016113,
"step": 400
},
{
"epoch": 0.8549292011755276,
"eval_logits/chosen": -1.337195634841919,
"eval_logits/rejected": -1.2981722354888916,
"eval_logps/chosen": -0.3401065170764923,
"eval_logps/rejected": -0.35557428002357483,
"eval_loss": 132.36317443847656,
"eval_rewards/accuracies": 0.5040322542190552,
"eval_rewards/chosen": -0.8502662181854248,
"eval_rewards/margins": 0.038669489324092865,
"eval_rewards/rejected": -0.8889357447624207,
"eval_runtime": 72.0543,
"eval_samples_per_second": 27.216,
"eval_steps_per_second": 0.86,
"step": 400
},
{
"epoch": 0.8656158161902218,
"grad_norm": 5594982.665070933,
"learning_rate": 5.2811296166831666e-08,
"logits/chosen": -1.1403666734695435,
"logits/rejected": -1.0579333305358887,
"logps/chosen": -0.34073713421821594,
"logps/rejected": -0.33352339267730713,
"loss": 122.7746,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8518427610397339,
"rewards/margins": -0.01803441345691681,
"rewards/rejected": -0.833808422088623,
"step": 405
},
{
"epoch": 0.8763024312049158,
"grad_norm": 365339.7208405054,
"learning_rate": 4.4761226670592066e-08,
"logits/chosen": -1.0983816385269165,
"logits/rejected": -1.0836502313613892,
"logps/chosen": -0.33261579275131226,
"logps/rejected": -0.3417048752307892,
"loss": 111.2202,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.831539511680603,
"rewards/margins": 0.022722622379660606,
"rewards/rejected": -0.8542621731758118,
"step": 410
},
{
"epoch": 0.88698904621961,
"grad_norm": 2850565.0664724754,
"learning_rate": 3.734784976300165e-08,
"logits/chosen": -1.0673637390136719,
"logits/rejected": -1.040725827217102,
"logps/chosen": -0.34171849489212036,
"logps/rejected": -0.348112016916275,
"loss": 191.3689,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.8542962074279785,
"rewards/margins": 0.015983855351805687,
"rewards/rejected": -0.8702800869941711,
"step": 415
},
{
"epoch": 0.897675661234304,
"grad_norm": 507688.39572092163,
"learning_rate": 3.058153372200695e-08,
"logits/chosen": -1.1354072093963623,
"logits/rejected": -1.1297013759613037,
"logps/chosen": -0.3288155198097229,
"logps/rejected": -0.3461647629737854,
"loss": 112.2704,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8220387697219849,
"rewards/margins": 0.04337311536073685,
"rewards/rejected": -0.8654119372367859,
"step": 420
},
{
"epoch": 0.9083622762489981,
"grad_norm": 7451525102.30645,
"learning_rate": 2.4471741852423233e-08,
"logits/chosen": -1.0978658199310303,
"logits/rejected": -1.0448986291885376,
"logps/chosen": -0.3529340624809265,
"logps/rejected": -0.35007423162460327,
"loss": 3170.9543,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.8823351860046387,
"rewards/margins": -0.007149559445679188,
"rewards/rejected": -0.8751856088638306,
"step": 425
},
{
"epoch": 0.9190488912636923,
"grad_norm": 10934704.996058388,
"learning_rate": 1.9027019250647036e-08,
"logits/chosen": -1.1120647192001343,
"logits/rejected": -1.0629384517669678,
"logps/chosen": -0.30386024713516235,
"logps/rejected": -0.31913992762565613,
"loss": 237.2099,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.759650707244873,
"rewards/margins": 0.03819913789629936,
"rewards/rejected": -0.7978497743606567,
"step": 430
},
{
"epoch": 0.9297355062783863,
"grad_norm": 1059239098.6557789,
"learning_rate": 1.4254980853566246e-08,
"logits/chosen": -1.1208689212799072,
"logits/rejected": -1.076522946357727,
"logps/chosen": -0.3260021507740021,
"logps/rejected": -0.32419848442077637,
"loss": 252.7438,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8150054216384888,
"rewards/margins": -0.004509164486080408,
"rewards/rejected": -0.8104962110519409,
"step": 435
},
{
"epoch": 0.9404221212930804,
"grad_norm": 3163894.8555042273,
"learning_rate": 1.016230078838226e-08,
"logits/chosen": -1.1477479934692383,
"logits/rejected": -1.1257246732711792,
"logps/chosen": -0.3194740116596222,
"logps/rejected": -0.3639461398124695,
"loss": 283.9071,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.7986849546432495,
"rewards/margins": 0.11118029057979584,
"rewards/rejected": -0.9098652601242065,
"step": 440
},
{
"epoch": 0.9511087363077745,
"grad_norm": 313415.0194399257,
"learning_rate": 6.754703038239329e-09,
"logits/chosen": -1.1323530673980713,
"logits/rejected": -1.0702521800994873,
"logps/chosen": -0.323483407497406,
"logps/rejected": -0.3070305287837982,
"loss": 142.406,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.8087084889411926,
"rewards/margins": -0.041132211685180664,
"rewards/rejected": -0.7675763368606567,
"step": 445
},
{
"epoch": 0.9617953513224686,
"grad_norm": 312608.20391974325,
"learning_rate": 4.036953436716895e-09,
"logits/chosen": -1.0631930828094482,
"logits/rejected": -1.0676857233047485,
"logps/chosen": -0.34714624285697937,
"logps/rejected": -0.3853607773780823,
"loss": 98.3866,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8678655624389648,
"rewards/margins": 0.09553632885217667,
"rewards/rejected": -0.9634019136428833,
"step": 450
},
{
"epoch": 0.9724819663371627,
"grad_norm": 98094.25868242758,
"learning_rate": 2.0128530023804656e-09,
"logits/chosen": -1.1627556085586548,
"logits/rejected": -1.131043791770935,
"logps/chosen": -0.34627044200897217,
"logps/rejected": -0.32559770345687866,
"loss": 136.3491,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.8656761050224304,
"rewards/margins": -0.05168183892965317,
"rewards/rejected": -0.813994288444519,
"step": 455
},
{
"epoch": 0.9831685813518568,
"grad_norm": 25583015.430213835,
"learning_rate": 6.852326227130833e-10,
"logits/chosen": -1.1027119159698486,
"logits/rejected": -1.1237401962280273,
"logps/chosen": -0.33976924419403076,
"logps/rejected": -0.3329155147075653,
"loss": 116.0967,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8494230508804321,
"rewards/margins": -0.01713428646326065,
"rewards/rejected": -0.8322887420654297,
"step": 460
},
{
"epoch": 0.9938551963665508,
"grad_norm": 351352.5694241463,
"learning_rate": 5.594909486328348e-11,
"logits/chosen": -1.212425947189331,
"logits/rejected": -1.1084251403808594,
"logps/chosen": -0.3214932084083557,
"logps/rejected": -0.3376317620277405,
"loss": 2736.6258,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.8037330508232117,
"rewards/margins": 0.04034631699323654,
"rewards/rejected": -0.844079315662384,
"step": 465
},
{
"epoch": 0.9981298423724285,
"step": 467,
"total_flos": 0.0,
"train_loss": 444.38003229635433,
"train_runtime": 7255.1322,
"train_samples_per_second": 8.253,
"train_steps_per_second": 0.064
}
],
"logging_steps": 5,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}