dpo_7_mar / trainer_state.json
Gege24's picture
Upload task output 19fccc14-8df6-4085-86ee-ce740ccdff30
92f0fc9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.08977063602495623,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008977063602495623,
"grad_norm": 49.75,
"learning_rate": 1.8797356064157479e-06,
"logits/chosen": -1.5944416522979736,
"logits/rejected": -1.6120755672454834,
"logps/chosen": -218.08145141601562,
"logps/rejected": -238.650634765625,
"loss": 0.69,
"rewards/accuracies": 0.375,
"rewards/chosen": 0.003223979379981756,
"rewards/margins": 0.006512450985610485,
"rewards/rejected": -0.003288471605628729,
"step": 5
},
{
"epoch": 0.0017954127204991247,
"grad_norm": 52.75,
"learning_rate": 4.229405114435433e-06,
"logits/chosen": -1.638082504272461,
"logits/rejected": -1.644774079322815,
"logps/chosen": -218.1611785888672,
"logps/rejected": -238.66098022460938,
"loss": 0.6066,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.07871033251285553,
"rewards/margins": 0.1871051788330078,
"rewards/rejected": -0.10839483886957169,
"step": 10
},
{
"epoch": 0.002693119080748687,
"grad_norm": 28.625,
"learning_rate": 6.579074622455118e-06,
"logits/chosen": -1.7269313335418701,
"logits/rejected": -1.7303335666656494,
"logps/chosen": -226.7982635498047,
"logps/rejected": -258.6452331542969,
"loss": 0.3006,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -0.6204186081886292,
"rewards/margins": 1.243154764175415,
"rewards/rejected": -1.8635733127593994,
"step": 15
},
{
"epoch": 0.0035908254409982494,
"grad_norm": 7.84375,
"learning_rate": 8.928744130474802e-06,
"logits/chosen": -1.7967208623886108,
"logits/rejected": -1.814859390258789,
"logps/chosen": -229.9993133544922,
"logps/rejected": -288.5595397949219,
"loss": 0.0873,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.8708345890045166,
"rewards/margins": 3.8205904960632324,
"rewards/rejected": -5.691425323486328,
"step": 20
},
{
"epoch": 0.004488531801247812,
"grad_norm": 14.875,
"learning_rate": 1.1278413638494489e-05,
"logits/chosen": -1.7308677434921265,
"logits/rejected": -1.7561269998550415,
"logps/chosen": -243.46029663085938,
"logps/rejected": -340.27764892578125,
"loss": 0.018,
"rewards/accuracies": 0.984375,
"rewards/chosen": -3.0787580013275146,
"rewards/margins": 7.586331367492676,
"rewards/rejected": -10.665090560913086,
"step": 25
},
{
"epoch": 0.005386238161497374,
"grad_norm": 8.25,
"learning_rate": 1.3628083146514173e-05,
"logits/chosen": -1.6984357833862305,
"logits/rejected": -1.7242708206176758,
"logps/chosen": -264.53125,
"logps/rejected": -376.6803894042969,
"loss": 0.0285,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -4.747864246368408,
"rewards/margins": 9.422819137573242,
"rewards/rejected": -14.170684814453125,
"step": 30
},
{
"epoch": 0.006283944521746937,
"grad_norm": 4.1875,
"learning_rate": 1.5977752654533858e-05,
"logits/chosen": -1.6455342769622803,
"logits/rejected": -1.6728187799453735,
"logps/chosen": -281.365966796875,
"logps/rejected": -407.96337890625,
"loss": 0.0216,
"rewards/accuracies": 0.984375,
"rewards/chosen": -6.47125768661499,
"rewards/margins": 10.662993431091309,
"rewards/rejected": -17.13425064086914,
"step": 35
},
{
"epoch": 0.007181650881996499,
"grad_norm": 6.3125,
"learning_rate": 1.6447684804072058e-05,
"logits/chosen": -1.5919939279556274,
"logits/rejected": -1.617920160293579,
"logps/chosen": -278.3464660644531,
"logps/rejected": -404.8271484375,
"loss": 0.0342,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -6.385420799255371,
"rewards/margins": 10.654914855957031,
"rewards/rejected": -17.040334701538086,
"step": 40
},
{
"epoch": 0.00807935724224606,
"grad_norm": 6.625,
"learning_rate": 1.6447677686306693e-05,
"logits/chosen": -1.6035076379776,
"logits/rejected": -1.6163572072982788,
"logps/chosen": -279.767822265625,
"logps/rejected": -383.13458251953125,
"loss": 0.0432,
"rewards/accuracies": 0.9593750238418579,
"rewards/chosen": -5.704493999481201,
"rewards/margins": 8.454301834106445,
"rewards/rejected": -14.158796310424805,
"step": 45
},
{
"epoch": 0.008977063602495623,
"grad_norm": 4.8125,
"learning_rate": 1.6447665093343918e-05,
"logits/chosen": -1.6678664684295654,
"logits/rejected": -1.6700479984283447,
"logps/chosen": -264.4732971191406,
"logps/rejected": -363.1940612792969,
"loss": 0.0464,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -5.7117018699646,
"rewards/margins": 7.980559349060059,
"rewards/rejected": -13.692262649536133,
"step": 50
},
{
"epoch": 0.009874769962745186,
"grad_norm": 4.65625,
"learning_rate": 1.6447647025194904e-05,
"logits/chosen": -1.5799241065979004,
"logits/rejected": -1.5821675062179565,
"logps/chosen": -268.1691589355469,
"logps/rejected": -374.00518798828125,
"loss": 0.0205,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.694095134735107,
"rewards/margins": 8.601041793823242,
"rewards/rejected": -14.295137405395508,
"step": 55
},
{
"epoch": 0.010772476322994749,
"grad_norm": 4.625,
"learning_rate": 1.6447623481875693e-05,
"logits/chosen": -1.615523338317871,
"logits/rejected": -1.6053167581558228,
"logps/chosen": -269.4774475097656,
"logps/rejected": -389.2843933105469,
"loss": 0.0186,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.536202907562256,
"rewards/margins": 9.901906967163086,
"rewards/rejected": -15.4381103515625,
"step": 60
},
{
"epoch": 0.011670182683244311,
"grad_norm": 5.0625,
"learning_rate": 1.644759446340718e-05,
"logits/chosen": -1.62222158908844,
"logits/rejected": -1.6126108169555664,
"logps/chosen": -273.30316162109375,
"logps/rejected": -401.03851318359375,
"loss": 0.0237,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -5.82913875579834,
"rewards/margins": 10.689005851745605,
"rewards/rejected": -16.518144607543945,
"step": 65
},
{
"epoch": 0.012567889043493874,
"grad_norm": 5.0625,
"learning_rate": 1.644755996981513e-05,
"logits/chosen": -1.6640081405639648,
"logits/rejected": -1.6507833003997803,
"logps/chosen": -274.74053955078125,
"logps/rejected": -404.7392883300781,
"loss": 0.0319,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -5.577990531921387,
"rewards/margins": 10.969307899475098,
"rewards/rejected": -16.547298431396484,
"step": 70
},
{
"epoch": 0.013465595403743435,
"grad_norm": 30.25,
"learning_rate": 1.6447520001130158e-05,
"logits/chosen": -1.5772068500518799,
"logits/rejected": -1.5707197189331055,
"logps/chosen": -278.74658203125,
"logps/rejected": -406.4679260253906,
"loss": 0.0555,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -6.121307849884033,
"rewards/margins": 10.843521118164062,
"rewards/rejected": -16.96483039855957,
"step": 75
},
{
"epoch": 0.014363301763992998,
"grad_norm": 3.109375,
"learning_rate": 1.6447474557387748e-05,
"logits/chosen": -1.53738534450531,
"logits/rejected": -1.534790277481079,
"logps/chosen": -314.1097717285156,
"logps/rejected": -420.47308349609375,
"loss": 0.0185,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -9.349452018737793,
"rewards/margins": 8.684282302856445,
"rewards/rejected": -18.033733367919922,
"step": 80
},
{
"epoch": 0.01526100812424256,
"grad_norm": 4.03125,
"learning_rate": 1.6447423638628237e-05,
"logits/chosen": -1.5148117542266846,
"logits/rejected": -1.5227998495101929,
"logps/chosen": -316.7289123535156,
"logps/rejected": -430.1024475097656,
"loss": 0.0115,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -10.431829452514648,
"rewards/margins": 9.403815269470215,
"rewards/rejected": -19.835643768310547,
"step": 85
},
{
"epoch": 0.01615871448449212,
"grad_norm": 3.84375,
"learning_rate": 1.6447367244896826e-05,
"logits/chosen": -1.606527328491211,
"logits/rejected": -1.6100928783416748,
"logps/chosen": -319.0159606933594,
"logps/rejected": -440.75738525390625,
"loss": 0.0257,
"rewards/accuracies": 0.984375,
"rewards/chosen": -10.429452896118164,
"rewards/margins": 10.196511268615723,
"rewards/rejected": -20.625965118408203,
"step": 90
},
{
"epoch": 0.017056420844741686,
"grad_norm": 24.125,
"learning_rate": 1.644730537624358e-05,
"logits/chosen": -1.6584317684173584,
"logits/rejected": -1.6650241613388062,
"logps/chosen": -315.8913269042969,
"logps/rejected": -439.966552734375,
"loss": 0.0371,
"rewards/accuracies": 0.984375,
"rewards/chosen": -10.410109519958496,
"rewards/margins": 10.561999320983887,
"rewards/rejected": -20.972110748291016,
"step": 95
},
{
"epoch": 0.017954127204991246,
"grad_norm": 13.375,
"learning_rate": 1.644723803272341e-05,
"logits/chosen": -1.6700522899627686,
"logits/rejected": -1.6734033823013306,
"logps/chosen": -319.60516357421875,
"logps/rejected": -441.1056213378906,
"loss": 0.0222,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -10.69153881072998,
"rewards/margins": 10.231417655944824,
"rewards/rejected": -20.922958374023438,
"step": 100
},
{
"epoch": 0.01885183356524081,
"grad_norm": 6.3125,
"learning_rate": 1.644716521439611e-05,
"logits/chosen": -1.7060085535049438,
"logits/rejected": -1.7089792490005493,
"logps/chosen": -347.458984375,
"logps/rejected": -468.8707580566406,
"loss": 0.0224,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.833531379699707,
"rewards/margins": 10.041297912597656,
"rewards/rejected": -22.874828338623047,
"step": 105
},
{
"epoch": 0.019749539925490372,
"grad_norm": 10.8125,
"learning_rate": 1.644708692132631e-05,
"logits/chosen": -1.7060142755508423,
"logits/rejected": -1.7084630727767944,
"logps/chosen": -356.79656982421875,
"logps/rejected": -479.97235107421875,
"loss": 0.0264,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -13.447436332702637,
"rewards/margins": 10.534585952758789,
"rewards/rejected": -23.982025146484375,
"step": 110
},
{
"epoch": 0.020647246285739933,
"grad_norm": 6.15625,
"learning_rate": 1.6447003153583514e-05,
"logits/chosen": -1.642289161682129,
"logits/rejected": -1.6480754613876343,
"logps/chosen": -356.51251220703125,
"logps/rejected": -481.41827392578125,
"loss": 0.0122,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -13.347066879272461,
"rewards/margins": 10.536771774291992,
"rewards/rejected": -23.883838653564453,
"step": 115
},
{
"epoch": 0.021544952645989497,
"grad_norm": 8.3125,
"learning_rate": 1.644691391124208e-05,
"logits/chosen": -1.6251140832901,
"logits/rejected": -1.6293065547943115,
"logps/chosen": -362.5952453613281,
"logps/rejected": -487.474853515625,
"loss": 0.0638,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.098132133483887,
"rewards/margins": 10.43175983428955,
"rewards/rejected": -24.529891967773438,
"step": 120
},
{
"epoch": 0.022442659006239058,
"grad_norm": 0.9375,
"learning_rate": 1.6446819194381232e-05,
"logits/chosen": -1.6321861743927002,
"logits/rejected": -1.6401519775390625,
"logps/chosen": -365.3531188964844,
"logps/rejected": -472.98590087890625,
"loss": 0.0341,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.91484546661377,
"rewards/margins": 8.75818920135498,
"rewards/rejected": -23.673038482666016,
"step": 125
},
{
"epoch": 0.023340365366488623,
"grad_norm": 16.125,
"learning_rate": 1.6446719003085048e-05,
"logits/chosen": -1.682080864906311,
"logits/rejected": -1.6901333332061768,
"logps/chosen": -375.67047119140625,
"logps/rejected": -475.9076232910156,
"loss": 0.0263,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.836702346801758,
"rewards/margins": 8.173124313354492,
"rewards/rejected": -23.00982666015625,
"step": 130
},
{
"epoch": 0.024238071726738183,
"grad_norm": 5.09375,
"learning_rate": 1.6446613337442464e-05,
"logits/chosen": -1.7631546258926392,
"logits/rejected": -1.75924813747406,
"logps/chosen": -334.9482421875,
"logps/rejected": -438.7393493652344,
"loss": 0.0274,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -12.050386428833008,
"rewards/margins": 8.398730278015137,
"rewards/rejected": -20.44911766052246,
"step": 135
},
{
"epoch": 0.025135778086987748,
"grad_norm": 16.125,
"learning_rate": 1.6446502197547285e-05,
"logits/chosen": -1.6700756549835205,
"logits/rejected": -1.6583993434906006,
"logps/chosen": -327.59222412109375,
"logps/rejected": -443.2696228027344,
"loss": 0.0349,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -10.80932903289795,
"rewards/margins": 9.692026138305664,
"rewards/rejected": -20.501354217529297,
"step": 140
},
{
"epoch": 0.02603348444723731,
"grad_norm": 6.5625,
"learning_rate": 1.6446385583498166e-05,
"logits/chosen": -1.603623628616333,
"logits/rejected": -1.5888742208480835,
"logps/chosen": -325.58660888671875,
"logps/rejected": -446.7493591308594,
"loss": 0.0473,
"rewards/accuracies": 0.96875,
"rewards/chosen": -11.347869873046875,
"rewards/margins": 10.037898063659668,
"rewards/rejected": -21.38576889038086,
"step": 145
},
{
"epoch": 0.02693119080748687,
"grad_norm": 5.21875,
"learning_rate": 1.6446263495398625e-05,
"logits/chosen": -1.6120811700820923,
"logits/rejected": -1.5870082378387451,
"logps/chosen": -317.60162353515625,
"logps/rejected": -438.42633056640625,
"loss": 0.0193,
"rewards/accuracies": 0.984375,
"rewards/chosen": -9.432976722717285,
"rewards/margins": 10.237409591674805,
"rewards/rejected": -19.670385360717773,
"step": 150
},
{
"epoch": 0.027828897167736434,
"grad_norm": 3.4375,
"learning_rate": 1.644613593335704e-05,
"logits/chosen": -1.5875444412231445,
"logits/rejected": -1.5749518871307373,
"logps/chosen": -303.3853454589844,
"logps/rejected": -432.47808837890625,
"loss": 0.0154,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -8.602149963378906,
"rewards/margins": 10.840978622436523,
"rewards/rejected": -19.443126678466797,
"step": 155
},
{
"epoch": 0.028726603527985995,
"grad_norm": 3.9375,
"learning_rate": 1.6446002897486648e-05,
"logits/chosen": -1.618011236190796,
"logits/rejected": -1.6145331859588623,
"logps/chosen": -312.6946105957031,
"logps/rejected": -446.9215393066406,
"loss": 0.0265,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -8.892807960510254,
"rewards/margins": 11.515321731567383,
"rewards/rejected": -20.408130645751953,
"step": 160
},
{
"epoch": 0.02962430988823556,
"grad_norm": 5.96875,
"learning_rate": 1.644586438790554e-05,
"logits/chosen": -1.5836814641952515,
"logits/rejected": -1.587181806564331,
"logps/chosen": -306.9125061035156,
"logps/rejected": -445.1659240722656,
"loss": 0.0256,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.431074142456055,
"rewards/margins": 11.796531677246094,
"rewards/rejected": -21.22760581970215,
"step": 165
},
{
"epoch": 0.03052201624848512,
"grad_norm": 11.125,
"learning_rate": 1.6445720404736678e-05,
"logits/chosen": -1.6508190631866455,
"logits/rejected": -1.65244460105896,
"logps/chosen": -310.2176208496094,
"logps/rejected": -440.75714111328125,
"loss": 0.0288,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.615917205810547,
"rewards/margins": 11.127912521362305,
"rewards/rejected": -20.743831634521484,
"step": 170
},
{
"epoch": 0.031419722608734685,
"grad_norm": 7.0,
"learning_rate": 1.644557094810787e-05,
"logits/chosen": -1.7216987609863281,
"logits/rejected": -1.7154676914215088,
"logps/chosen": -340.46466064453125,
"logps/rejected": -450.27294921875,
"loss": 0.0397,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -11.357550621032715,
"rewards/margins": 8.966830253601074,
"rewards/rejected": -20.32438087463379,
"step": 175
},
{
"epoch": 0.03231742896898424,
"grad_norm": 24.0,
"learning_rate": 1.6445416018151788e-05,
"logits/chosen": -1.7959930896759033,
"logits/rejected": -1.800244927406311,
"logps/chosen": -331.75506591796875,
"logps/rejected": -427.2474670410156,
"loss": 0.0207,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.001955032348633,
"rewards/margins": 7.540495872497559,
"rewards/rejected": -19.542451858520508,
"step": 180
},
{
"epoch": 0.03321513532923381,
"grad_norm": 43.75,
"learning_rate": 1.644525561500596e-05,
"logits/chosen": -1.9910930395126343,
"logits/rejected": -1.9894500970840454,
"logps/chosen": -350.335693359375,
"logps/rejected": -449.16796875,
"loss": 0.0458,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.615007400512695,
"rewards/margins": 8.14229679107666,
"rewards/rejected": -21.757305145263672,
"step": 185
},
{
"epoch": 0.03411284168948337,
"grad_norm": 13.3125,
"learning_rate": 1.6445089738812785e-05,
"logits/chosen": -1.9771511554718018,
"logits/rejected": -1.9749290943145752,
"logps/chosen": -348.55828857421875,
"logps/rejected": -453.4300842285156,
"loss": 0.0432,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.186309814453125,
"rewards/margins": 8.536267280578613,
"rewards/rejected": -21.722576141357422,
"step": 190
},
{
"epoch": 0.035010548049732935,
"grad_norm": 0.146484375,
"learning_rate": 1.6444918389719505e-05,
"logits/chosen": -1.9536895751953125,
"logits/rejected": -1.9450359344482422,
"logps/chosen": -335.36553955078125,
"logps/rejected": -448.2103576660156,
"loss": 0.0296,
"rewards/accuracies": 0.984375,
"rewards/chosen": -11.7564697265625,
"rewards/margins": 9.423359870910645,
"rewards/rejected": -21.179828643798828,
"step": 195
},
{
"epoch": 0.03590825440998249,
"grad_norm": 20.25,
"learning_rate": 1.644474156787822e-05,
"logits/chosen": -1.878861665725708,
"logits/rejected": -1.8584403991699219,
"logps/chosen": -319.38067626953125,
"logps/rejected": -443.59033203125,
"loss": 0.0307,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -10.937708854675293,
"rewards/margins": 10.651717185974121,
"rewards/rejected": -21.589426040649414,
"step": 200
},
{
"epoch": 0.03680596077023206,
"grad_norm": 17.875,
"learning_rate": 1.6444559273445908e-05,
"logits/chosen": -1.6908838748931885,
"logits/rejected": -1.6792293787002563,
"logps/chosen": -321.28009033203125,
"logps/rejected": -460.1475524902344,
"loss": 0.022,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -10.206222534179688,
"rewards/margins": 11.932024955749512,
"rewards/rejected": -22.138248443603516,
"step": 205
},
{
"epoch": 0.03770366713048162,
"grad_norm": 7.0,
"learning_rate": 1.6444371506584377e-05,
"logits/chosen": -1.6957308053970337,
"logits/rejected": -1.690157175064087,
"logps/chosen": -290.0798034667969,
"logps/rejected": -412.7769470214844,
"loss": 0.018,
"rewards/accuracies": 0.984375,
"rewards/chosen": -7.47702693939209,
"rewards/margins": 10.390680313110352,
"rewards/rejected": -17.86771011352539,
"step": 210
},
{
"epoch": 0.03860137349073118,
"grad_norm": 8.5625,
"learning_rate": 1.644417826746031e-05,
"logits/chosen": -1.650665521621704,
"logits/rejected": -1.6541109085083008,
"logps/chosen": -290.309326171875,
"logps/rejected": -417.1390686035156,
"loss": 0.0184,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.279687404632568,
"rewards/margins": 10.711103439331055,
"rewards/rejected": -17.990793228149414,
"step": 215
},
{
"epoch": 0.039499079850980744,
"grad_norm": 8.4375,
"learning_rate": 1.6443979556245252e-05,
"logits/chosen": -1.6047160625457764,
"logits/rejected": -1.6234970092773438,
"logps/chosen": -322.4940185546875,
"logps/rejected": -458.4981384277344,
"loss": 0.0263,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -9.725212097167969,
"rewards/margins": 11.667040824890137,
"rewards/rejected": -21.39225196838379,
"step": 220
},
{
"epoch": 0.04039678621123031,
"grad_norm": 23.5,
"learning_rate": 1.6443775373115592e-05,
"logits/chosen": -1.5689036846160889,
"logits/rejected": -1.5908584594726562,
"logps/chosen": -343.6766052246094,
"logps/rejected": -472.40948486328125,
"loss": 0.0679,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -11.692893028259277,
"rewards/margins": 10.971635818481445,
"rewards/rejected": -22.66452980041504,
"step": 225
},
{
"epoch": 0.041294492571479866,
"grad_norm": 8.6875,
"learning_rate": 1.6443565718252586e-05,
"logits/chosen": -1.5273631811141968,
"logits/rejected": -1.5362848043441772,
"logps/chosen": -333.0375061035156,
"logps/rejected": -450.404541015625,
"loss": 0.0259,
"rewards/accuracies": 0.984375,
"rewards/chosen": -11.445978164672852,
"rewards/margins": 9.806886672973633,
"rewards/rejected": -21.25286293029785,
"step": 230
},
{
"epoch": 0.04219219893172943,
"grad_norm": 9.25,
"learning_rate": 1.644335059184234e-05,
"logits/chosen": -1.4887597560882568,
"logits/rejected": -1.5063436031341553,
"logps/chosen": -334.2455139160156,
"logps/rejected": -466.03326416015625,
"loss": 0.0239,
"rewards/accuracies": 0.984375,
"rewards/chosen": -12.205533981323242,
"rewards/margins": 10.970166206359863,
"rewards/rejected": -23.175701141357422,
"step": 235
},
{
"epoch": 0.043089905291978994,
"grad_norm": 5.125,
"learning_rate": 1.644312999407582e-05,
"logits/chosen": -1.4916335344314575,
"logits/rejected": -1.5103265047073364,
"logps/chosen": -339.73175048828125,
"logps/rejected": -466.05645751953125,
"loss": 0.0523,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -12.136640548706055,
"rewards/margins": 10.656683921813965,
"rewards/rejected": -22.793325424194336,
"step": 240
},
{
"epoch": 0.04398761165222856,
"grad_norm": 6.1875,
"learning_rate": 1.644290392514886e-05,
"logits/chosen": -1.4491441249847412,
"logits/rejected": -1.4810426235198975,
"logps/chosen": -330.1545104980469,
"logps/rejected": -447.7718811035156,
"loss": 0.0283,
"rewards/accuracies": 0.984375,
"rewards/chosen": -11.471087455749512,
"rewards/margins": 9.670295715332031,
"rewards/rejected": -21.14138412475586,
"step": 245
},
{
"epoch": 0.044885318012478116,
"grad_norm": 9.75,
"learning_rate": 1.6442672385262126e-05,
"logits/chosen": -1.3768192529678345,
"logits/rejected": -1.4130717515945435,
"logps/chosen": -315.2447204589844,
"logps/rejected": -440.4739685058594,
"loss": 0.0267,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.256295204162598,
"rewards/margins": 10.560578346252441,
"rewards/rejected": -19.81687355041504,
"step": 250
},
{
"epoch": 0.04578302437272768,
"grad_norm": 1.8125,
"learning_rate": 1.6442435374621164e-05,
"logits/chosen": -1.3219325542449951,
"logits/rejected": -1.3581187725067139,
"logps/chosen": -295.7016906738281,
"logps/rejected": -422.5862731933594,
"loss": 0.0325,
"rewards/accuracies": 0.984375,
"rewards/chosen": -8.51601791381836,
"rewards/margins": 10.66891098022461,
"rewards/rejected": -19.18492889404297,
"step": 255
},
{
"epoch": 0.046680730732977245,
"grad_norm": 1.265625,
"learning_rate": 1.6442192893436368e-05,
"logits/chosen": -1.2778997421264648,
"logits/rejected": -1.312280535697937,
"logps/chosen": -303.11151123046875,
"logps/rejected": -420.34912109375,
"loss": 0.024,
"rewards/accuracies": 0.984375,
"rewards/chosen": -8.392851829528809,
"rewards/margins": 9.67725658416748,
"rewards/rejected": -18.070110321044922,
"step": 260
},
{
"epoch": 0.0475784370932268,
"grad_norm": 2.765625,
"learning_rate": 1.644194494192298e-05,
"logits/chosen": -1.2928860187530518,
"logits/rejected": -1.3260154724121094,
"logps/chosen": -286.2437438964844,
"logps/rejected": -393.56231689453125,
"loss": 0.0281,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -7.124878883361816,
"rewards/margins": 8.837203979492188,
"rewards/rejected": -15.962081909179688,
"step": 265
},
{
"epoch": 0.04847614345347637,
"grad_norm": 16.125,
"learning_rate": 1.6441691520301115e-05,
"logits/chosen": -1.278626799583435,
"logits/rejected": -1.3031818866729736,
"logps/chosen": -310.00946044921875,
"logps/rejected": -427.1468200683594,
"loss": 0.0328,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.098076820373535,
"rewards/margins": 9.82015609741211,
"rewards/rejected": -18.918231964111328,
"step": 270
},
{
"epoch": 0.04937384981372593,
"grad_norm": 2.625,
"learning_rate": 1.644143262879573e-05,
"logits/chosen": -1.355022668838501,
"logits/rejected": -1.377715826034546,
"logps/chosen": -317.8216247558594,
"logps/rejected": -431.4881896972656,
"loss": 0.0385,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -9.668425559997559,
"rewards/margins": 9.433794975280762,
"rewards/rejected": -19.102222442626953,
"step": 275
},
{
"epoch": 0.050271556173975496,
"grad_norm": 9.75,
"learning_rate": 1.644116826763664e-05,
"logits/chosen": -1.3636281490325928,
"logits/rejected": -1.384377360343933,
"logps/chosen": -302.4226379394531,
"logps/rejected": -413.4556579589844,
"loss": 0.0475,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -9.204570770263672,
"rewards/margins": 9.083320617675781,
"rewards/rejected": -18.287891387939453,
"step": 280
},
{
"epoch": 0.05116926253422505,
"grad_norm": 8.125,
"learning_rate": 1.6440898437058523e-05,
"logits/chosen": -1.340986728668213,
"logits/rejected": -1.3553143739700317,
"logps/chosen": -313.5350341796875,
"logps/rejected": -417.8633728027344,
"loss": 0.0624,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": -8.931371688842773,
"rewards/margins": 8.754600524902344,
"rewards/rejected": -17.68597412109375,
"step": 285
},
{
"epoch": 0.05206696889447462,
"grad_norm": 1.4609375,
"learning_rate": 1.64406231373009e-05,
"logits/chosen": -1.3220717906951904,
"logits/rejected": -1.3361364603042603,
"logps/chosen": -301.2831726074219,
"logps/rejected": -406.24481201171875,
"loss": 0.033,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.154109001159668,
"rewards/margins": 8.63221263885498,
"rewards/rejected": -17.78632164001465,
"step": 290
},
{
"epoch": 0.05296467525472418,
"grad_norm": 5.15625,
"learning_rate": 1.6440342368608156e-05,
"logits/chosen": -1.2657798528671265,
"logits/rejected": -1.2791422605514526,
"logps/chosen": -326.9562072753906,
"logps/rejected": -439.09356689453125,
"loss": 0.061,
"rewards/accuracies": 0.965624988079071,
"rewards/chosen": -11.56185531616211,
"rewards/margins": 9.128196716308594,
"rewards/rejected": -20.690053939819336,
"step": 295
},
{
"epoch": 0.05386238161497374,
"grad_norm": 0.0198974609375,
"learning_rate": 1.6440056131229532e-05,
"logits/chosen": -1.2754865884780884,
"logits/rejected": -1.2849574089050293,
"logps/chosen": -345.1870422363281,
"logps/rejected": -475.9234924316406,
"loss": 0.0144,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -11.819540977478027,
"rewards/margins": 11.19267463684082,
"rewards/rejected": -23.012216567993164,
"step": 300
},
{
"epoch": 0.054760087975223304,
"grad_norm": 1.828125,
"learning_rate": 1.6439764425419112e-05,
"logits/chosen": -1.274107813835144,
"logits/rejected": -1.2885282039642334,
"logps/chosen": -330.4737854003906,
"logps/rejected": -473.398681640625,
"loss": 0.0267,
"rewards/accuracies": 0.984375,
"rewards/chosen": -11.204086303710938,
"rewards/margins": 12.114290237426758,
"rewards/rejected": -23.318378448486328,
"step": 305
},
{
"epoch": 0.05565779433547287,
"grad_norm": 13.9375,
"learning_rate": 1.6439467251435852e-05,
"logits/chosen": -1.2366708517074585,
"logits/rejected": -1.2527769804000854,
"logps/chosen": -323.2499694824219,
"logps/rejected": -467.12890625,
"loss": 0.023,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -11.250632286071777,
"rewards/margins": 12.270976066589355,
"rewards/rejected": -23.521610260009766,
"step": 310
},
{
"epoch": 0.05655550069572243,
"grad_norm": 2.21875,
"learning_rate": 1.6439164609543545e-05,
"logits/chosen": -1.287007212638855,
"logits/rejected": -1.315598726272583,
"logps/chosen": -315.24200439453125,
"logps/rejected": -456.1424255371094,
"loss": 0.0308,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -10.730131149291992,
"rewards/margins": 11.966339111328125,
"rewards/rejected": -22.696468353271484,
"step": 315
},
{
"epoch": 0.05745320705597199,
"grad_norm": 4.03125,
"learning_rate": 1.6438856500010842e-05,
"logits/chosen": -1.412188172340393,
"logits/rejected": -1.4369876384735107,
"logps/chosen": -304.62298583984375,
"logps/rejected": -442.72808837890625,
"loss": 0.0438,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -9.49343490600586,
"rewards/margins": 11.810284614562988,
"rewards/rejected": -21.303720474243164,
"step": 320
},
{
"epoch": 0.058350913416221555,
"grad_norm": 3.34375,
"learning_rate": 1.643854292311126e-05,
"logits/chosen": -1.4016748666763306,
"logits/rejected": -1.4312589168548584,
"logps/chosen": -321.81982421875,
"logps/rejected": -464.9969787597656,
"loss": 0.016,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -9.75760269165039,
"rewards/margins": 12.245366096496582,
"rewards/rejected": -22.00296974182129,
"step": 325
},
{
"epoch": 0.05924861977647112,
"grad_norm": 8.125,
"learning_rate": 1.6438223879123157e-05,
"logits/chosen": -1.420204520225525,
"logits/rejected": -1.4521286487579346,
"logps/chosen": -334.3448486328125,
"logps/rejected": -486.096435546875,
"loss": 0.0395,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -11.95421028137207,
"rewards/margins": 13.074414253234863,
"rewards/rejected": -25.02862548828125,
"step": 330
},
{
"epoch": 0.060146326136720676,
"grad_norm": 7.4375,
"learning_rate": 1.6437899368329744e-05,
"logits/chosen": -1.4968700408935547,
"logits/rejected": -1.516984224319458,
"logps/chosen": -362.8880310058594,
"logps/rejected": -503.55047607421875,
"loss": 0.0443,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.416742324829102,
"rewards/margins": 12.286725997924805,
"rewards/rejected": -25.703466415405273,
"step": 335
},
{
"epoch": 0.06104403249697024,
"grad_norm": 3.65625,
"learning_rate": 1.643756939101909e-05,
"logits/chosen": -1.4715522527694702,
"logits/rejected": -1.500880479812622,
"logps/chosen": -347.5955505371094,
"logps/rejected": -490.23345947265625,
"loss": 0.0188,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -13.808749198913574,
"rewards/margins": 12.222585678100586,
"rewards/rejected": -26.031335830688477,
"step": 340
},
{
"epoch": 0.061941738857219805,
"grad_norm": 0.0283203125,
"learning_rate": 1.6437233947484115e-05,
"logits/chosen": -1.4634774923324585,
"logits/rejected": -1.4903171062469482,
"logps/chosen": -346.1238708496094,
"logps/rejected": -480.40740966796875,
"loss": 0.026,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -13.692280769348145,
"rewards/margins": 11.461533546447754,
"rewards/rejected": -25.153812408447266,
"step": 345
},
{
"epoch": 0.06283944521746937,
"grad_norm": 2.734375,
"learning_rate": 1.6436893038022587e-05,
"logits/chosen": -1.4172029495239258,
"logits/rejected": -1.442546010017395,
"logps/chosen": -339.7004089355469,
"logps/rejected": -476.07861328125,
"loss": 0.0222,
"rewards/accuracies": 0.984375,
"rewards/chosen": -13.242881774902344,
"rewards/margins": 11.535491943359375,
"rewards/rejected": -24.77837562561035,
"step": 350
},
{
"epoch": 0.06373715157771893,
"grad_norm": 2.515625,
"learning_rate": 1.6436546662937136e-05,
"logits/chosen": -1.4132306575775146,
"logits/rejected": -1.438727855682373,
"logps/chosen": -340.2590637207031,
"logps/rejected": -478.34759521484375,
"loss": 0.03,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -12.395490646362305,
"rewards/margins": 11.930428504943848,
"rewards/rejected": -24.325918197631836,
"step": 355
},
{
"epoch": 0.06463485793796848,
"grad_norm": 1.9609375,
"learning_rate": 1.6436194822535237e-05,
"logits/chosen": -1.3696801662445068,
"logits/rejected": -1.4052057266235352,
"logps/chosen": -332.30072021484375,
"logps/rejected": -476.0782165527344,
"loss": 0.0272,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -12.618162155151367,
"rewards/margins": 12.268811225891113,
"rewards/rejected": -24.886974334716797,
"step": 360
},
{
"epoch": 0.06553256429821805,
"grad_norm": 1.421875,
"learning_rate": 1.643583751712921e-05,
"logits/chosen": -1.3992929458618164,
"logits/rejected": -1.4282127618789673,
"logps/chosen": -334.0565490722656,
"logps/rejected": -480.23785400390625,
"loss": 0.0185,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -11.715566635131836,
"rewards/margins": 12.651227951049805,
"rewards/rejected": -24.36679458618164,
"step": 365
},
{
"epoch": 0.06643027065846761,
"grad_norm": 5.1875,
"learning_rate": 1.6435474747036243e-05,
"logits/chosen": -1.453920602798462,
"logits/rejected": -1.4755427837371826,
"logps/chosen": -322.3940124511719,
"logps/rejected": -467.23126220703125,
"loss": 0.0371,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -11.112676620483398,
"rewards/margins": 12.558187484741211,
"rewards/rejected": -23.67086410522461,
"step": 370
},
{
"epoch": 0.06732797701871718,
"grad_norm": 3.84375,
"learning_rate": 1.643510651257836e-05,
"logits/chosen": -1.4459034204483032,
"logits/rejected": -1.4675936698913574,
"logps/chosen": -320.50347900390625,
"logps/rejected": -463.59014892578125,
"loss": 0.0144,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -10.755430221557617,
"rewards/margins": 12.185154914855957,
"rewards/rejected": -22.94058609008789,
"step": 375
},
{
"epoch": 0.06822568337896674,
"grad_norm": 0.0186767578125,
"learning_rate": 1.6434732814082442e-05,
"logits/chosen": -1.4478992223739624,
"logits/rejected": -1.4632583856582642,
"logps/chosen": -331.53289794921875,
"logps/rejected": -466.4090881347656,
"loss": 0.0205,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -10.466972351074219,
"rewards/margins": 11.554471969604492,
"rewards/rejected": -22.021446228027344,
"step": 380
},
{
"epoch": 0.0691233897392163,
"grad_norm": 5.21875,
"learning_rate": 1.6434353651880223e-05,
"logits/chosen": -1.4576263427734375,
"logits/rejected": -1.470090627670288,
"logps/chosen": -322.73773193359375,
"logps/rejected": -458.134521484375,
"loss": 0.0305,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -10.470837593078613,
"rewards/margins": 11.618741035461426,
"rewards/rejected": -22.089576721191406,
"step": 385
},
{
"epoch": 0.07002109609946587,
"grad_norm": 2.875,
"learning_rate": 1.643396902630828e-05,
"logits/chosen": -1.4584577083587646,
"logits/rejected": -1.4607679843902588,
"logps/chosen": -317.8088073730469,
"logps/rejected": -451.45440673828125,
"loss": 0.0322,
"rewards/accuracies": 0.9781249761581421,
"rewards/chosen": -10.205190658569336,
"rewards/margins": 11.55008316040039,
"rewards/rejected": -21.75527572631836,
"step": 390
},
{
"epoch": 0.07091880245971542,
"grad_norm": 6.96875,
"learning_rate": 1.6433578937708046e-05,
"logits/chosen": -1.4126781225204468,
"logits/rejected": -1.4239190816879272,
"logps/chosen": -322.39801025390625,
"logps/rejected": -458.4278259277344,
"loss": 0.0256,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.876721382141113,
"rewards/margins": 11.59015941619873,
"rewards/rejected": -21.466880798339844,
"step": 395
},
{
"epoch": 0.07181650881996499,
"grad_norm": 0.02001953125,
"learning_rate": 1.64331833864258e-05,
"logits/chosen": -1.4148705005645752,
"logits/rejected": -1.4251186847686768,
"logps/chosen": -316.4896545410156,
"logps/rejected": -450.53240966796875,
"loss": 0.0164,
"rewards/accuracies": 0.9906250238418579,
"rewards/chosen": -9.625123023986816,
"rewards/margins": 11.543768882751465,
"rewards/rejected": -21.168895721435547,
"step": 400
},
{
"epoch": 0.07271421518021455,
"grad_norm": 0.890625,
"learning_rate": 1.643278237281267e-05,
"logits/chosen": -1.421555757522583,
"logits/rejected": -1.4265415668487549,
"logps/chosen": -308.0295104980469,
"logps/rejected": -445.5311584472656,
"loss": 0.0249,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.17651653289795,
"rewards/margins": 11.756936073303223,
"rewards/rejected": -20.933452606201172,
"step": 405
},
{
"epoch": 0.07361192154046411,
"grad_norm": 2.140625,
"learning_rate": 1.6432375897224637e-05,
"logits/chosen": -1.3315099477767944,
"logits/rejected": -1.3350989818572998,
"logps/chosen": -315.9861755371094,
"logps/rejected": -451.1835021972656,
"loss": 0.044,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.11551284790039,
"rewards/margins": 11.56936264038086,
"rewards/rejected": -20.68487548828125,
"step": 410
},
{
"epoch": 0.07450962790071368,
"grad_norm": 0.94140625,
"learning_rate": 1.6431963960022524e-05,
"logits/chosen": -1.2902719974517822,
"logits/rejected": -1.2910665273666382,
"logps/chosen": -309.9100341796875,
"logps/rejected": -442.4892578125,
"loss": 0.0278,
"rewards/accuracies": 0.984375,
"rewards/chosen": -9.109766006469727,
"rewards/margins": 11.337265968322754,
"rewards/rejected": -20.447031021118164,
"step": 415
},
{
"epoch": 0.07540733426096324,
"grad_norm": 4.28125,
"learning_rate": 1.643154656157201e-05,
"logits/chosen": -1.2414597272872925,
"logits/rejected": -1.2512853145599365,
"logps/chosen": -301.22174072265625,
"logps/rejected": -416.2430725097656,
"loss": 0.0651,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -9.233153343200684,
"rewards/margins": 9.558819770812988,
"rewards/rejected": -18.79197120666504,
"step": 420
},
{
"epoch": 0.07630504062121281,
"grad_norm": 2.984375,
"learning_rate": 1.6431123702243618e-05,
"logits/chosen": -1.2505871057510376,
"logits/rejected": -1.2604036331176758,
"logps/chosen": -319.97369384765625,
"logps/rejected": -420.20184326171875,
"loss": 0.0256,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -9.754773139953613,
"rewards/margins": 7.977179527282715,
"rewards/rejected": -17.731952667236328,
"step": 425
},
{
"epoch": 0.07720274698146236,
"grad_norm": 2.75,
"learning_rate": 1.6430695382412714e-05,
"logits/chosen": -1.2662450075149536,
"logits/rejected": -1.2877540588378906,
"logps/chosen": -322.77532958984375,
"logps/rejected": -426.742919921875,
"loss": 0.0443,
"rewards/accuracies": 0.984375,
"rewards/chosen": -10.8510103225708,
"rewards/margins": 8.357492446899414,
"rewards/rejected": -19.208499908447266,
"step": 430
},
{
"epoch": 0.07810045334171192,
"grad_norm": 0.039794921875,
"learning_rate": 1.6430261602459523e-05,
"logits/chosen": -1.291669487953186,
"logits/rejected": -1.3137457370758057,
"logps/chosen": -338.3268127441406,
"logps/rejected": -457.73077392578125,
"loss": 0.0177,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.022209167480469,
"rewards/margins": 10.057284355163574,
"rewards/rejected": -22.07949447631836,
"step": 435
},
{
"epoch": 0.07899815970196149,
"grad_norm": 7.125,
"learning_rate": 1.6429822362769104e-05,
"logits/chosen": -1.2740453481674194,
"logits/rejected": -1.2928683757781982,
"logps/chosen": -351.74859619140625,
"logps/rejected": -467.54345703125,
"loss": 0.0661,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.225227355957031,
"rewards/margins": 9.72153377532959,
"rewards/rejected": -22.946762084960938,
"step": 440
},
{
"epoch": 0.07989586606221105,
"grad_norm": 5.0625,
"learning_rate": 1.642937766373137e-05,
"logits/chosen": -1.2580888271331787,
"logits/rejected": -1.2826154232025146,
"logps/chosen": -356.80303955078125,
"logps/rejected": -462.6808166503906,
"loss": 0.05,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -13.879257202148438,
"rewards/margins": 8.64616584777832,
"rewards/rejected": -22.525421142578125,
"step": 445
},
{
"epoch": 0.08079357242246062,
"grad_norm": 3.53125,
"learning_rate": 1.6428927505741077e-05,
"logits/chosen": -1.3274773359298706,
"logits/rejected": -1.3538029193878174,
"logps/chosen": -351.6380920410156,
"logps/rejected": -460.96075439453125,
"loss": 0.0209,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -12.758635520935059,
"rewards/margins": 9.032510757446289,
"rewards/rejected": -21.79114532470703,
"step": 450
},
{
"epoch": 0.08169127878271018,
"grad_norm": 2.140625,
"learning_rate": 1.642847188919783e-05,
"logits/chosen": -1.3372552394866943,
"logits/rejected": -1.3623110055923462,
"logps/chosen": -347.4330139160156,
"logps/rejected": -465.4501037597656,
"loss": 0.0277,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": -12.52314281463623,
"rewards/margins": 9.909950256347656,
"rewards/rejected": -22.43309211730957,
"step": 455
},
{
"epoch": 0.08258898514295973,
"grad_norm": 5.5,
"learning_rate": 1.6428010814506082e-05,
"logits/chosen": -1.3123576641082764,
"logits/rejected": -1.339634895324707,
"logps/chosen": -338.0999450683594,
"logps/rejected": -456.74310302734375,
"loss": 0.0419,
"rewards/accuracies": 0.96875,
"rewards/chosen": -13.015867233276367,
"rewards/margins": 9.875136375427246,
"rewards/rejected": -22.891002655029297,
"step": 460
},
{
"epoch": 0.0834866915032093,
"grad_norm": 1.703125,
"learning_rate": 1.6427544282075123e-05,
"logits/chosen": -1.3849332332611084,
"logits/rejected": -1.4038164615631104,
"logps/chosen": -353.74908447265625,
"logps/rejected": -478.51287841796875,
"loss": 0.0462,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -13.985809326171875,
"rewards/margins": 10.343815803527832,
"rewards/rejected": -24.32962417602539,
"step": 465
},
{
"epoch": 0.08438439786345886,
"grad_norm": 0.1669921875,
"learning_rate": 1.642707229231909e-05,
"logits/chosen": -1.3579334020614624,
"logits/rejected": -1.3771896362304688,
"logps/chosen": -369.145751953125,
"logps/rejected": -490.97100830078125,
"loss": 0.0146,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -15.570086479187012,
"rewards/margins": 9.97675895690918,
"rewards/rejected": -25.546846389770508,
"step": 470
},
{
"epoch": 0.08528210422370842,
"grad_norm": 1.8203125,
"learning_rate": 1.6426594845656973e-05,
"logits/chosen": -1.355943202972412,
"logits/rejected": -1.3650095462799072,
"logps/chosen": -376.259521484375,
"logps/rejected": -499.44970703125,
"loss": 0.0148,
"rewards/accuracies": 0.9937499761581421,
"rewards/chosen": -15.661959648132324,
"rewards/margins": 10.384978294372559,
"rewards/rejected": -26.04693603515625,
"step": 475
},
{
"epoch": 0.08617981058395799,
"grad_norm": 2.265625,
"learning_rate": 1.642611194251259e-05,
"logits/chosen": -1.355452299118042,
"logits/rejected": -1.3569139242172241,
"logps/chosen": -376.9379577636719,
"logps/rejected": -506.89666748046875,
"loss": 0.0385,
"rewards/accuracies": 0.971875011920929,
"rewards/chosen": -15.602932929992676,
"rewards/margins": 10.967208862304688,
"rewards/rejected": -26.570140838623047,
"step": 480
},
{
"epoch": 0.08707751694420755,
"grad_norm": 1.71875,
"learning_rate": 1.642562358331462e-05,
"logits/chosen": -1.3472042083740234,
"logits/rejected": -1.3535155057907104,
"logps/chosen": -362.6130676269531,
"logps/rejected": -499.9092712402344,
"loss": 0.0131,
"rewards/accuracies": 0.987500011920929,
"rewards/chosen": -14.739013671875,
"rewards/margins": 11.621049880981445,
"rewards/rejected": -26.360065460205078,
"step": 485
},
{
"epoch": 0.08797522330445712,
"grad_norm": 1.1171875,
"learning_rate": 1.6425129768496577e-05,
"logits/chosen": -1.3245633840560913,
"logits/rejected": -1.3288484811782837,
"logps/chosen": -364.94903564453125,
"logps/rejected": -503.7896423339844,
"loss": 0.0225,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.195643424987793,
"rewards/margins": 11.918843269348145,
"rewards/rejected": -26.114486694335938,
"step": 490
},
{
"epoch": 0.08887292966470667,
"grad_norm": 3.65625,
"learning_rate": 1.6424630498496813e-05,
"logits/chosen": -1.3164643049240112,
"logits/rejected": -1.3220335245132446,
"logps/chosen": -365.0807189941406,
"logps/rejected": -510.30340576171875,
"loss": 0.0186,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.422874450683594,
"rewards/margins": 12.447381973266602,
"rewards/rejected": -26.870258331298828,
"step": 495
},
{
"epoch": 0.08977063602495623,
"grad_norm": 2.203125,
"learning_rate": 1.6424125773758535e-05,
"logits/chosen": -1.418001413345337,
"logits/rejected": -1.4166367053985596,
"logps/chosen": -361.4501037597656,
"logps/rejected": -499.55902099609375,
"loss": 0.0316,
"rewards/accuracies": 0.984375,
"rewards/chosen": -14.319913864135742,
"rewards/margins": 11.92530632019043,
"rewards/rejected": -26.245220184326172,
"step": 500
},
{
"epoch": 0.08977063602495623,
"eval_logits/chosen": -1.3433603048324585,
"eval_logits/rejected": -1.36442232131958,
"eval_logps/chosen": -369.4107666015625,
"eval_logps/rejected": -511.26239013671875,
"eval_loss": 0.014039273373782635,
"eval_rewards/accuracies": 0.9900000095367432,
"eval_rewards/chosen": -13.992281913757324,
"eval_rewards/margins": 12.298800468444824,
"eval_rewards/rejected": -26.29108238220215,
"eval_runtime": 10.3033,
"eval_samples_per_second": 19.411,
"eval_steps_per_second": 19.411,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 16707,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}