Llama-3.1-8B-Instruct-KTO-700 / trainer_state.json
chchen's picture
End of training
f55036d verified
{
"best_metric": 0.21406607329845428,
"best_model_checkpoint": "saves/sycophancy/Llama-8B-3.1-Instruct/kto-700/checkpoint-700",
"epoch": 9.933333333333334,
"eval_steps": 50,
"global_step": 780,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12698412698412698,
"grad_norm": 0.6788301467895508,
"kl": 4.387261390686035,
"learning_rate": 6.41025641025641e-07,
"logits/chosen": -5470348.467532467,
"logits/rejected": -7884086.746987952,
"logps/chosen": -18.302797788149352,
"logps/rejected": -20.305931734751507,
"loss": 0.5003,
"rewards/chosen": -0.0015607382376472672,
"rewards/margins": -0.002322281828985735,
"rewards/rejected": 0.0007615435913384679,
"step": 10
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.5719761252403259,
"kl": 5.789945602416992,
"learning_rate": 1.282051282051282e-06,
"logits/chosen": -5099802.810810811,
"logits/rejected": -7728657.860465116,
"logps/chosen": -15.63258815456081,
"logps/rejected": -19.22288335755814,
"loss": 0.4999,
"rewards/chosen": 0.0009517762306574228,
"rewards/margins": 0.0008202694979274445,
"rewards/rejected": 0.00013150673272997835,
"step": 20
},
{
"epoch": 0.38095238095238093,
"grad_norm": 1.0039992332458496,
"kl": 5.201549530029297,
"learning_rate": 1.9230769230769234e-06,
"logits/chosen": -6082714.074074074,
"logits/rejected": -6860618.53164557,
"logps/chosen": -18.337599766107253,
"logps/rejected": -18.856711085838608,
"loss": 0.5001,
"rewards/chosen": 0.0019511194140822798,
"rewards/margins": -0.0016700430789353758,
"rewards/rejected": 0.0036211624930176556,
"step": 30
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.8720445036888123,
"kl": 5.458462715148926,
"learning_rate": 2.564102564102564e-06,
"logits/chosen": -6005976.847058823,
"logits/rejected": -7850483.2,
"logps/chosen": -16.87245662913603,
"logps/rejected": -20.8893798828125,
"loss": 0.4999,
"rewards/chosen": 0.0022517660084892723,
"rewards/margins": 0.0017136495136747176,
"rewards/rejected": 0.0005381164948145548,
"step": 40
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.7587161064147949,
"kl": 5.623558044433594,
"learning_rate": 3.205128205128206e-06,
"logits/chosen": -6492087.466666667,
"logits/rejected": -6836711.152941177,
"logps/chosen": -17.93591796875,
"logps/rejected": -20.48527401194853,
"loss": 0.4998,
"rewards/chosen": 0.0073052302996317545,
"rewards/margins": 0.0005395582610485603,
"rewards/rejected": 0.006765672038583194,
"step": 50
},
{
"epoch": 0.6349206349206349,
"eval_logits/chosen": -6144470.349206349,
"eval_logits/rejected": -7140612.155844156,
"eval_logps/chosen": -15.744288853236608,
"eval_logps/rejected": -19.299322747564936,
"eval_loss": 0.4993804395198822,
"eval_rewards/chosen": 0.011447552650693863,
"eval_rewards/margins": 0.006427939363028236,
"eval_rewards/rejected": 0.0050196132876656275,
"eval_runtime": 28.9366,
"eval_samples_per_second": 4.838,
"eval_steps_per_second": 2.419,
"kl": 4.069240093231201,
"step": 50
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.7702682018280029,
"kl": 4.712460517883301,
"learning_rate": 3.846153846153847e-06,
"logits/chosen": -6175395.348837209,
"logits/rejected": -7327229.405405405,
"logps/chosen": -18.813822901526162,
"logps/rejected": -19.35801243137669,
"loss": 0.4996,
"rewards/chosen": 0.015079736709594727,
"rewards/margins": 0.0028353905355608146,
"rewards/rejected": 0.012244346174033912,
"step": 60
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.8070928454399109,
"kl": 5.428678512573242,
"learning_rate": 4.487179487179488e-06,
"logits/chosen": -5747504.898876404,
"logits/rejected": -7797529.23943662,
"logps/chosen": -16.341429292485955,
"logps/rejected": -20.183217223261444,
"loss": 0.4981,
"rewards/chosen": 0.02937422173746516,
"rewards/margins": 0.011152571412388481,
"rewards/rejected": 0.01822165032507668,
"step": 70
},
{
"epoch": 1.019047619047619,
"grad_norm": 0.9550174474716187,
"kl": 8.223318099975586,
"learning_rate": 4.999899863449631e-06,
"logits/chosen": -6264741.052631579,
"logits/rejected": -7049278.476190476,
"logps/chosen": -16.723974930612666,
"logps/rejected": -18.56858898344494,
"loss": 0.4976,
"rewards/chosen": 0.05320446114791067,
"rewards/margins": 0.022168612121639397,
"rewards/rejected": 0.031035849026271274,
"step": 80
},
{
"epoch": 1.146031746031746,
"grad_norm": 1.3274585008621216,
"kl": 13.437724113464355,
"learning_rate": 4.996395926410354e-06,
"logits/chosen": -7089864.091954023,
"logits/rejected": -7814464.876712329,
"logps/chosen": -14.959357601472702,
"logps/rejected": -19.25765196917808,
"loss": 0.4894,
"rewards/chosen": 0.11598257086742883,
"rewards/margins": 0.080287168277866,
"rewards/rejected": 0.03569540258956282,
"step": 90
},
{
"epoch": 1.273015873015873,
"grad_norm": 1.2156286239624023,
"kl": 15.28927993774414,
"learning_rate": 4.9878931808274796e-06,
"logits/chosen": -5328334.222222222,
"logits/rejected": -7435643.636363637,
"logps/chosen": -16.729354858398438,
"logps/rejected": -20.145260897549715,
"loss": 0.4897,
"rewards/chosen": 0.1613529788123237,
"rewards/margins": 0.09188530421016193,
"rewards/rejected": 0.06946767460216176,
"step": 100
},
{
"epoch": 1.273015873015873,
"eval_logits/chosen": -6027973.079365079,
"eval_logits/rejected": -7100376.103896104,
"eval_logps/chosen": -14.120830233134921,
"eval_logps/rejected": -19.052055854301948,
"eval_loss": 0.4845275282859802,
"eval_rewards/chosen": 0.17379347483317056,
"eval_rewards/margins": 0.1440471064992797,
"eval_rewards/rejected": 0.02974636833389084,
"eval_runtime": 29.0593,
"eval_samples_per_second": 4.818,
"eval_steps_per_second": 2.409,
"kl": 8.730419158935547,
"step": 100
},
{
"epoch": 1.4,
"grad_norm": 1.2826446294784546,
"kl": 16.87066078186035,
"learning_rate": 4.9744086526850724e-06,
"logits/chosen": -6426819.6,
"logits/rejected": -6980560.0,
"logps/chosen": -14.482907104492188,
"logps/rejected": -20.08314208984375,
"loss": 0.4695,
"rewards/chosen": 0.25329229831695554,
"rewards/margins": 0.2625655651092529,
"rewards/rejected": -0.009273266792297364,
"step": 110
},
{
"epoch": 1.5269841269841269,
"grad_norm": 1.266455054283142,
"kl": 9.237666130065918,
"learning_rate": 4.955969343539162e-06,
"logits/chosen": -5808524.273972603,
"logits/rejected": -7559982.344827586,
"logps/chosen": -15.187309369648972,
"logps/rejected": -21.856114179238507,
"loss": 0.4685,
"rewards/chosen": 0.2450432973365261,
"rewards/margins": 0.2776630093330211,
"rewards/rejected": -0.03261971199649504,
"step": 120
},
{
"epoch": 1.6539682539682539,
"grad_norm": 1.8095053434371948,
"kl": 5.514086723327637,
"learning_rate": 4.93261217644956e-06,
"logits/chosen": -5919740.651162791,
"logits/rejected": -6616346.810810811,
"logps/chosen": -15.901311830032704,
"logps/rejected": -23.59872848923142,
"loss": 0.4326,
"rewards/chosen": 0.22375809869100882,
"rewards/margins": 0.563330051510833,
"rewards/rejected": -0.3395719528198242,
"step": 130
},
{
"epoch": 1.7809523809523808,
"grad_norm": 2.0375888347625732,
"kl": 2.2235469818115234,
"learning_rate": 4.90438392204474e-06,
"logits/chosen": -6043188.043956044,
"logits/rejected": -6697519.304347826,
"logps/chosen": -14.64537366929945,
"logps/rejected": -23.984520069067027,
"loss": 0.402,
"rewards/chosen": 0.3531372573349502,
"rewards/margins": 0.8639071981396944,
"rewards/rejected": -0.5107699408047441,
"step": 140
},
{
"epoch": 1.9079365079365078,
"grad_norm": 1.6195893287658691,
"kl": 4.571013927459717,
"learning_rate": 4.8713411048678635e-06,
"logits/chosen": -3909229.4492753623,
"logits/rejected": -7315652.923076923,
"logps/chosen": -15.875658118206522,
"logps/rejected": -24.993848192822803,
"loss": 0.3937,
"rewards/chosen": 0.2891683163850204,
"rewards/margins": 0.8816854565324705,
"rewards/rejected": -0.5925171401474502,
"step": 150
},
{
"epoch": 1.9079365079365078,
"eval_logits/chosen": -5056723.301587301,
"eval_logits/rejected": -6579766.025974026,
"eval_logps/chosen": -12.606183733258929,
"eval_logps/rejected": -26.850310090300326,
"eval_loss": 0.37451449036598206,
"eval_rewards/chosen": 0.3252580128018818,
"eval_rewards/margins": 1.0753370810724783,
"eval_rewards/rejected": -0.7500790682705966,
"eval_runtime": 28.8826,
"eval_samples_per_second": 4.847,
"eval_steps_per_second": 2.424,
"kl": 0.8561515808105469,
"step": 150
},
{
"epoch": 2.038095238095238,
"grad_norm": 1.7494949102401733,
"kl": 0.0,
"learning_rate": 4.83354989019146e-06,
"logits/chosen": -4793371.826086956,
"logits/rejected": -6571360.94117647,
"logps/chosen": -12.75027200450068,
"logps/rejected": -29.481036017922793,
"loss": 0.3622,
"rewards/chosen": 0.4139024900353473,
"rewards/margins": 1.3404797775971005,
"rewards/rejected": -0.9265772875617532,
"step": 160
},
{
"epoch": 2.165079365079365,
"grad_norm": 1.6517891883850098,
"kl": 0.9644966125488281,
"learning_rate": 4.791085951527408e-06,
"logits/chosen": -5720041.788235294,
"logits/rejected": -6800391.253333333,
"logps/chosen": -14.041024241727941,
"logps/rejected": -31.237737630208333,
"loss": 0.3456,
"rewards/chosen": 0.43515203139361214,
"rewards/margins": 1.515641024720435,
"rewards/rejected": -1.080488993326823,
"step": 170
},
{
"epoch": 2.292063492063492,
"grad_norm": 2.6275811195373535,
"kl": 4.330084800720215,
"learning_rate": 4.744034319097536e-06,
"logits/chosen": -5309098.366197183,
"logits/rejected": -6363057.617977528,
"logps/chosen": -12.973335373569542,
"logps/rejected": -34.70361876755618,
"loss": 0.29,
"rewards/chosen": 0.5134559953716439,
"rewards/margins": 2.0540087132877254,
"rewards/rejected": -1.5405527179160814,
"step": 180
},
{
"epoch": 2.419047619047619,
"grad_norm": 1.039764642715454,
"kl": 0.5446796417236328,
"learning_rate": 4.692489209568234e-06,
"logits/chosen": -5042669.6,
"logits/rejected": -7686574.4,
"logps/chosen": -10.94500503540039,
"logps/rejected": -37.45916748046875,
"loss": 0.2867,
"rewards/chosen": 0.6056368827819825,
"rewards/margins": 2.4049492835998536,
"rewards/rejected": -1.7993124008178711,
"step": 190
},
{
"epoch": 2.546031746031746,
"grad_norm": 4.989805221557617,
"kl": 3.305936813354492,
"learning_rate": 4.636553837390051e-06,
"logits/chosen": -3932123.4285714286,
"logits/rejected": -6716783.036144578,
"logps/chosen": -14.493068942775974,
"logps/rejected": -43.533367846385545,
"loss": 0.2821,
"rewards/chosen": 0.3073018061650264,
"rewards/margins": 2.6910732199451473,
"rewards/rejected": -2.3837714137801207,
"step": 200
},
{
"epoch": 2.546031746031746,
"eval_logits/chosen": -4519878.603174604,
"eval_logits/rejected": -6449476.571428572,
"eval_logps/chosen": -12.711128355964782,
"eval_logps/rejected": -41.09160980620941,
"eval_loss": 0.27482813596725464,
"eval_rewards/chosen": 0.31476356869652156,
"eval_rewards/margins": 2.4889729652569925,
"eval_rewards/rejected": -2.174209396560471,
"eval_runtime": 28.943,
"eval_samples_per_second": 4.837,
"eval_steps_per_second": 2.419,
"kl": 0.0,
"step": 200
},
{
"epoch": 2.6730158730158733,
"grad_norm": 1.1182432174682617,
"kl": 0.0,
"learning_rate": 4.5763402081200295e-06,
"logits/chosen": -4791521.156626506,
"logits/rejected": -7367645.090909091,
"logps/chosen": -12.727749376411898,
"logps/rejected": -46.403529575892854,
"loss": 0.2666,
"rewards/chosen": 0.36001044583607866,
"rewards/margins": 3.122220251411363,
"rewards/rejected": -2.762209805575284,
"step": 210
},
{
"epoch": 2.8,
"grad_norm": 1.5499324798583984,
"kl": 0.0,
"learning_rate": 4.511968894140639e-06,
"logits/chosen": -3259810.835443038,
"logits/rejected": -7566860.641975309,
"logps/chosen": -13.120756897745252,
"logps/rejected": -44.52433871045525,
"loss": 0.2741,
"rewards/chosen": 0.44851534879660304,
"rewards/margins": 2.8181276775967574,
"rewards/rejected": -2.3696123288001543,
"step": 220
},
{
"epoch": 2.9269841269841272,
"grad_norm": 1.0529024600982666,
"kl": 8.710567474365234,
"learning_rate": 4.443568793224415e-06,
"logits/chosen": -4267431.804878049,
"logits/rejected": -7291750.564102564,
"logps/chosen": -12.697584198742378,
"logps/rejected": -43.92189065004006,
"loss": 0.2893,
"rewards/chosen": 0.34749545120611425,
"rewards/margins": 2.7181909076864827,
"rewards/rejected": -2.3706954564803686,
"step": 230
},
{
"epoch": 3.057142857142857,
"grad_norm": 3.3076581954956055,
"kl": 0.0,
"learning_rate": 4.3712768704277535e-06,
"logits/chosen": -3580050.337078652,
"logits/rejected": -7818372.507042253,
"logps/chosen": -10.8568588428283,
"logps/rejected": -48.93149964238556,
"loss": 0.2325,
"rewards/chosen": 0.8793054644981128,
"rewards/margins": 3.6636493451649264,
"rewards/rejected": -2.7843438806668135,
"step": 240
},
{
"epoch": 3.1841269841269844,
"grad_norm": 0.7128299474716187,
"kl": 0.0,
"learning_rate": 4.2952378838306855e-06,
"logits/chosen": -4378369.263157895,
"logits/rejected": -6322734.095238095,
"logps/chosen": -10.363418579101562,
"logps/rejected": -55.15029761904762,
"loss": 0.2081,
"rewards/chosen": 0.657376439947831,
"rewards/margins": 4.1836971244716405,
"rewards/rejected": -3.5263206845238093,
"step": 250
},
{
"epoch": 3.1841269841269844,
"eval_logits/chosen": -4612124.952380952,
"eval_logits/rejected": -6554266.597402598,
"eval_logps/chosen": -12.67233615451389,
"eval_logps/rejected": -53.95966289569805,
"eval_loss": 0.2370194047689438,
"eval_rewards/chosen": 0.31864278278653585,
"eval_rewards/margins": 3.7796571712301232,
"eval_rewards/rejected": -3.4610143884435876,
"eval_runtime": 28.9457,
"eval_samples_per_second": 4.837,
"eval_steps_per_second": 2.418,
"kl": 0.0,
"step": 250
},
{
"epoch": 3.311111111111111,
"grad_norm": 2.5890753269195557,
"kl": 0.0,
"learning_rate": 4.215604094671835e-06,
"logits/chosen": -4538084.512820513,
"logits/rejected": -7292885.853658536,
"logps/chosen": -14.071156037159454,
"logps/rejected": -56.76498189786585,
"loss": 0.2585,
"rewards/chosen": 0.23726805662497497,
"rewards/margins": 3.8709395147995176,
"rewards/rejected": -3.633671458174543,
"step": 260
},
{
"epoch": 3.4380952380952383,
"grad_norm": 2.1079933643341064,
"kl": 0.0,
"learning_rate": 4.1325349624589625e-06,
"logits/chosen": -5173939.9529411765,
"logits/rejected": -6703439.36,
"logps/chosen": -11.632867072610294,
"logps/rejected": -67.75546875,
"loss": 0.231,
"rewards/chosen": 0.6439653284409467,
"rewards/margins": 5.392097652659697,
"rewards/rejected": -4.74813232421875,
"step": 270
},
{
"epoch": 3.565079365079365,
"grad_norm": 1.217970609664917,
"kl": 0.0,
"learning_rate": 4.046196825665638e-06,
"logits/chosen": -5146073.518987342,
"logits/rejected": -6300332.641975309,
"logps/chosen": -11.147742935373813,
"logps/rejected": -71.35388937114197,
"loss": 0.1961,
"rewards/chosen": 0.5749296840233139,
"rewards/margins": 5.709579266874393,
"rewards/rejected": -5.13464958285108,
"step": 280
},
{
"epoch": 3.6920634920634923,
"grad_norm": 1.1603816747665405,
"kl": 0.0,
"learning_rate": 3.956762568653378e-06,
"logits/chosen": -4096646.0235294118,
"logits/rejected": -6349589.333333333,
"logps/chosen": -13.472160788143382,
"logps/rejected": -70.83629557291667,
"loss": 0.2347,
"rewards/chosen": 0.394403076171875,
"rewards/margins": 5.51095947265625,
"rewards/rejected": -5.116556396484375,
"step": 290
},
{
"epoch": 3.819047619047619,
"grad_norm": 9.697389602661133,
"kl": 0.0,
"learning_rate": 3.8644112754862614e-06,
"logits/chosen": -4819242.271604938,
"logits/rejected": -6257292.151898734,
"logps/chosen": -17.26012279369213,
"logps/rejected": -67.79164606408227,
"loss": 0.2651,
"rewards/chosen": 0.10824219974470728,
"rewards/margins": 4.995783765458711,
"rewards/rejected": -4.887541565714003,
"step": 300
},
{
"epoch": 3.819047619047619,
"eval_logits/chosen": -4312815.746031746,
"eval_logits/rejected": -6286123.220779221,
"eval_logps/chosen": -15.552150181361608,
"eval_logps/rejected": -72.25813590706169,
"eval_loss": 0.22668065130710602,
"eval_rewards/chosen": 0.030661461845276846,
"eval_rewards/margins": 5.321523309958102,
"eval_rewards/rejected": -5.290861848112825,
"eval_runtime": 28.9019,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 2.422,
"kl": 0.0,
"step": 300
},
{
"epoch": 3.9460317460317462,
"grad_norm": 2.1276402473449707,
"kl": 0.0,
"learning_rate": 3.76932787133117e-06,
"logits/chosen": -5180418.157303371,
"logits/rejected": -6265091.605633803,
"logps/chosen": -13.515192953388343,
"logps/rejected": -74.84758747799296,
"loss": 0.2341,
"rewards/chosen": 0.4038938458046217,
"rewards/margins": 6.011258124248724,
"rewards/rejected": -5.607364278444102,
"step": 310
},
{
"epoch": 4.076190476190476,
"grad_norm": 2.4103806018829346,
"kl": 0.0,
"learning_rate": 3.6717027521617593e-06,
"logits/chosen": -5085308.097560976,
"logits/rejected": -6567972.512820513,
"logps/chosen": -9.632892143435594,
"logps/rejected": -75.03093699919872,
"loss": 0.1853,
"rewards/chosen": 0.7703150307259908,
"rewards/margins": 6.228486777395662,
"rewards/rejected": -5.458171746669671,
"step": 320
},
{
"epoch": 4.203174603174603,
"grad_norm": 1.1956801414489746,
"kl": 0.0,
"learning_rate": 3.5717314035076355e-06,
"logits/chosen": -4886774.588235294,
"logits/rejected": -6263912.96,
"logps/chosen": -12.183177274816176,
"logps/rejected": -80.09672526041666,
"loss": 0.1884,
"rewards/chosen": 0.5844921336454504,
"rewards/margins": 6.655275418150659,
"rewards/rejected": -6.070783284505208,
"step": 330
},
{
"epoch": 4.33015873015873,
"grad_norm": 1.5028984546661377,
"kl": 0.0,
"learning_rate": 3.4696140090121377e-06,
"logits/chosen": -3578742.436781609,
"logits/rejected": -6682695.890410959,
"logps/chosen": -11.774405643857758,
"logps/rejected": -76.78813944777397,
"loss": 0.2051,
"rewards/chosen": 0.5974952434671337,
"rewards/margins": 6.330363176766578,
"rewards/rejected": -5.732867933299444,
"step": 340
},
{
"epoch": 4.457142857142857,
"grad_norm": 0.1931433230638504,
"kl": 0.0,
"learning_rate": 3.3655550495825824e-06,
"logits/chosen": -4233703.2,
"logits/rejected": -6651906.4,
"logps/chosen": -9.5177734375,
"logps/rejected": -78.2944580078125,
"loss": 0.194,
"rewards/chosen": 0.690055513381958,
"rewards/margins": 6.46673321723938,
"rewards/rejected": -5.7766777038574215,
"step": 350
},
{
"epoch": 4.457142857142857,
"eval_logits/chosen": -4256054.349206349,
"eval_logits/rejected": -6266110.337662337,
"eval_logps/chosen": -15.28530544704861,
"eval_logps/rejected": -73.9473924512987,
"eval_loss": 0.22176755964756012,
"eval_rewards/chosen": 0.057345905001201326,
"eval_rewards/margins": 5.517134041669221,
"eval_rewards/rejected": -5.45978813666802,
"eval_runtime": 28.8957,
"eval_samples_per_second": 4.845,
"eval_steps_per_second": 2.423,
"kl": 0.0,
"step": 350
},
{
"epoch": 4.584126984126984,
"grad_norm": 1.0705475807189941,
"kl": 0.0,
"learning_rate": 3.2597628939356174e-06,
"logits/chosen": -5245969.777777778,
"logits/rejected": -6771841.454545454,
"logps/chosen": -16.555372450086807,
"logps/rejected": -78.77862548828125,
"loss": 0.2249,
"rewards/chosen": -0.15620039569007027,
"rewards/margins": 5.747189274942032,
"rewards/rejected": -5.9033896706321025,
"step": 360
},
{
"epoch": 4.711111111111111,
"grad_norm": 1.4375556707382202,
"kl": 0.0,
"learning_rate": 3.1524493813575936e-06,
"logits/chosen": -4801529.518987342,
"logits/rejected": -6302219.0617283955,
"logps/chosen": -13.753037851068038,
"logps/rejected": -85.18824749228395,
"loss": 0.181,
"rewards/chosen": 0.5262709508968305,
"rewards/margins": 7.078058904214732,
"rewards/rejected": -6.551787953317901,
"step": 370
},
{
"epoch": 4.838095238095238,
"grad_norm": 2.527923822402954,
"kl": 0.0,
"learning_rate": 3.043829397515419e-06,
"logits/chosen": -4641179.368421053,
"logits/rejected": -6583240.380952381,
"logps/chosen": -15.901164807771382,
"logps/rejected": -83.90581984747024,
"loss": 0.2053,
"rewards/chosen": 0.2012840572156404,
"rewards/margins": 6.545129969604034,
"rewards/rejected": -6.343845912388393,
"step": 380
},
{
"epoch": 4.965079365079365,
"grad_norm": 2.1507320404052734,
"kl": 1.0980682373046875,
"learning_rate": 2.9341204441673267e-06,
"logits/chosen": -4626031.816091954,
"logits/rejected": -5921111.671232876,
"logps/chosen": -18.242044383081897,
"logps/rejected": -79.91437687285959,
"loss": 0.2487,
"rewards/chosen": 0.032672991697815644,
"rewards/margins": 5.970230682461942,
"rewards/rejected": -5.937557690764127,
"step": 390
},
{
"epoch": 5.095238095238095,
"grad_norm": 0.8530715107917786,
"kl": 0.0,
"learning_rate": 2.8235422036351384e-06,
"logits/chosen": -3311293.263157895,
"logits/rejected": -6519166.476190476,
"logps/chosen": -10.424122860557155,
"logps/rejected": -86.8595958891369,
"loss": 0.168,
"rewards/chosen": 0.7402381395038805,
"rewards/margins": 7.611847781895993,
"rewards/rejected": -6.871609642392113,
"step": 400
},
{
"epoch": 5.095238095238095,
"eval_logits/chosen": -4093047.365079365,
"eval_logits/rejected": -6134765.714285715,
"eval_logps/chosen": -16.138509114583332,
"eval_logps/rejected": -79.63079596185065,
"eval_loss": 0.22183124721050262,
"eval_rewards/chosen": -0.027974476889958456,
"eval_rewards/margins": 6.000154010833256,
"eval_rewards/rejected": -6.028128487723214,
"eval_runtime": 28.9036,
"eval_samples_per_second": 4.844,
"eval_steps_per_second": 2.422,
"kl": 0.0,
"step": 400
},
{
"epoch": 5.222222222222222,
"grad_norm": 0.8624681830406189,
"kl": 5.43211555480957,
"learning_rate": 2.7123160989101623e-06,
"logits/chosen": -3609233.095890411,
"logits/rejected": -7120320.0,
"logps/chosen": -15.152233385059931,
"logps/rejected": -86.3641006645115,
"loss": 0.1736,
"rewards/chosen": 0.36869326029738336,
"rewards/margins": 7.0947001299094525,
"rewards/rejected": -6.726006869612069,
"step": 410
},
{
"epoch": 5.349206349206349,
"grad_norm": 0.0985303744673729,
"kl": 0.0,
"learning_rate": 2.6006648502735384e-06,
"logits/chosen": -4783202.666666667,
"logits/rejected": -5948822.736842105,
"logps/chosen": -11.800664992559524,
"logps/rejected": -89.84853001644737,
"loss": 0.1953,
"rewards/chosen": 0.5908058257330031,
"rewards/margins": 7.559863410796737,
"rewards/rejected": -6.969057585063734,
"step": 420
},
{
"epoch": 5.476190476190476,
"grad_norm": 1.2574081420898438,
"kl": 0.0,
"learning_rate": 2.4888120293188915e-06,
"logits/chosen": -4590456.746666667,
"logits/rejected": -6820007.152941177,
"logps/chosen": -17.311272786458332,
"logps/rejected": -84.2857479319853,
"loss": 0.2004,
"rewards/chosen": 0.06162989298502604,
"rewards/margins": 6.595223642985026,
"rewards/rejected": -6.53359375,
"step": 430
},
{
"epoch": 5.603174603174603,
"grad_norm": 0.8560766577720642,
"kl": 0.0,
"learning_rate": 2.376981611270305e-06,
"logits/chosen": -5536871.48051948,
"logits/rejected": -5756388.626506024,
"logps/chosen": -14.159840769581981,
"logps/rejected": -91.81920651355422,
"loss": 0.1843,
"rewards/chosen": 0.1429041949185458,
"rewards/margins": 7.223366914939254,
"rewards/rejected": -7.080462720020708,
"step": 440
},
{
"epoch": 5.73015873015873,
"grad_norm": 0.9085766673088074,
"kl": 0.0,
"learning_rate": 2.265397526492052e-06,
"logits/chosen": -3770334.7848101268,
"logits/rejected": -6611462.320987654,
"logps/chosen": -16.884657461431964,
"logps/rejected": -86.77527006172839,
"loss": 0.2268,
"rewards/chosen": -0.03714804106120822,
"rewards/margins": 6.53030560236086,
"rewards/rejected": -6.567453643422068,
"step": 450
},
{
"epoch": 5.73015873015873,
"eval_logits/chosen": -4156519.111111111,
"eval_logits/rejected": -6205292.883116883,
"eval_logps/chosen": -17.346346416170636,
"eval_logps/rejected": -87.04986810064935,
"eval_loss": 0.2162507325410843,
"eval_rewards/chosen": -0.14875823732406374,
"eval_rewards/margins": 6.6212763545489075,
"eval_rewards/rejected": -6.770034591872971,
"eval_runtime": 28.93,
"eval_samples_per_second": 4.839,
"eval_steps_per_second": 2.42,
"kl": 0.0,
"step": 450
},
{
"epoch": 5.857142857142857,
"grad_norm": 0.10580164194107056,
"kl": 0.0,
"learning_rate": 2.154283212088168e-06,
"logits/chosen": -5665110.476190476,
"logits/rejected": -7303102.315789473,
"logps/chosen": -18.501912434895832,
"logps/rejected": -97.78473221628289,
"loss": 0.1947,
"rewards/chosen": -0.05397450356256394,
"rewards/margins": 7.66953945100158,
"rewards/rejected": -7.723513954564145,
"step": 460
},
{
"epoch": 5.984126984126984,
"grad_norm": 11.131406784057617,
"kl": 0.0,
"learning_rate": 2.0438611644897186e-06,
"logits/chosen": -5625277.9130434785,
"logits/rejected": -5779500.235294118,
"logps/chosen": -12.774372930112092,
"logps/rejected": -83.66164263556985,
"loss": 0.2255,
"rewards/chosen": 0.533203995746115,
"rewards/margins": 6.9899062842054445,
"rewards/rejected": -6.456702288459329,
"step": 470
},
{
"epoch": 6.114285714285714,
"grad_norm": 1.4914944171905518,
"kl": 0.0,
"learning_rate": 1.934352493925695e-06,
"logits/chosen": -4793293.268292683,
"logits/rejected": -6536903.384615385,
"logps/chosen": -11.555212997808688,
"logps/rejected": -92.60106169871794,
"loss": 0.1766,
"rewards/chosen": 0.5902807654404059,
"rewards/margins": 7.9360464906006625,
"rewards/rejected": -7.345765725160256,
"step": 480
},
{
"epoch": 6.241269841269841,
"grad_norm": 0.13757291436195374,
"kl": 0.0,
"learning_rate": 1.8259764816696413e-06,
"logits/chosen": -3754589.5384615385,
"logits/rejected": -7616852.292682927,
"logps/chosen": -11.699952736879007,
"logps/rejected": -88.94584246379573,
"loss": 0.1854,
"rewards/chosen": 0.593261474218124,
"rewards/margins": 7.3913400577261275,
"rewards/rejected": -6.798078583508003,
"step": 490
},
{
"epoch": 6.368253968253969,
"grad_norm": 0.5682312846183777,
"kl": 0.0,
"learning_rate": 1.7189501409486061e-06,
"logits/chosen": -5201961.558441559,
"logits/rejected": -6742080.7710843375,
"logps/chosen": -16.54386921672078,
"logps/rejected": -90.30962914156626,
"loss": 0.1915,
"rewards/chosen": 0.06833470332158076,
"rewards/margins": 6.907243071167966,
"rewards/rejected": -6.838908367846385,
"step": 500
},
{
"epoch": 6.368253968253969,
"eval_logits/chosen": -4369097.650793651,
"eval_logits/rejected": -6383271.064935065,
"eval_logps/chosen": -16.69223942832341,
"eval_logps/rejected": -84.2469434862013,
"eval_loss": 0.21938754618167877,
"eval_rewards/chosen": -0.08334738110739087,
"eval_rewards/margins": 6.406394749320775,
"eval_rewards/rejected": -6.489742130428166,
"eval_runtime": 28.9437,
"eval_samples_per_second": 4.837,
"eval_steps_per_second": 2.418,
"kl": 0.0,
"step": 500
},
{
"epoch": 6.495238095238095,
"grad_norm": 1.1735241413116455,
"kl": 0.0,
"learning_rate": 1.613487782393661e-06,
"logits/chosen": -3442194.8235294116,
"logits/rejected": -6263149.653333333,
"logps/chosen": -10.886692181755516,
"logps/rejected": -93.58182291666667,
"loss": 0.176,
"rewards/chosen": 0.7260306414435892,
"rewards/margins": 8.333493206547756,
"rewards/rejected": -7.607462565104167,
"step": 510
},
{
"epoch": 6.622222222222222,
"grad_norm": 0.7961702942848206,
"kl": 0.0,
"learning_rate": 1.509800584902108e-06,
"logits/chosen": -3291153.3333333335,
"logits/rejected": -6859345.454545454,
"logps/chosen": -17.832650078667534,
"logps/rejected": -86.62845680930398,
"loss": 0.2017,
"rewards/chosen": -0.013874345355563693,
"rewards/margins": 6.7318663717520355,
"rewards/rejected": -6.7457407171076,
"step": 520
},
{
"epoch": 6.749206349206349,
"grad_norm": 0.05785604566335678,
"kl": 0.0,
"learning_rate": 1.4080961727707185e-06,
"logits/chosen": -5060515.084337349,
"logits/rejected": -5728966.64935065,
"logps/chosen": -16.07660426863705,
"logps/rejected": -87.74072899756493,
"loss": 0.2131,
"rewards/chosen": 0.19671148277190795,
"rewards/margins": 6.894206980438304,
"rewards/rejected": -6.697495497666396,
"step": 530
},
{
"epoch": 6.876190476190477,
"grad_norm": 0.3619508147239685,
"kl": 0.0,
"learning_rate": 1.3085781999467303e-06,
"logits/chosen": -5035710.608695652,
"logits/rejected": -7935128.470588235,
"logps/chosen": -9.16219827403193,
"logps/rejected": -96.11182358685662,
"loss": 0.1709,
"rewards/chosen": 0.8181397811226223,
"rewards/margins": 8.408293194783008,
"rewards/rejected": -7.590153413660386,
"step": 540
},
{
"epoch": 7.006349206349206,
"grad_norm": 0.5851492285728455,
"kl": 0.0,
"learning_rate": 1.2114459422291205e-06,
"logits/chosen": -4834725.052631579,
"logits/rejected": -6980896.761904762,
"logps/chosen": -20.62944914165296,
"logps/rejected": -96.27944800967262,
"loss": 0.201,
"rewards/chosen": -0.46881359501888875,
"rewards/margins": 7.173670797419728,
"rewards/rejected": -7.642484392438616,
"step": 550
},
{
"epoch": 7.006349206349206,
"eval_logits/chosen": -4304407.365079365,
"eval_logits/rejected": -6343543.688311689,
"eval_logps/chosen": -17.056785946800595,
"eval_logps/rejected": -86.64208350243507,
"eval_loss": 0.21892935037612915,
"eval_rewards/chosen": -0.11980220249720983,
"eval_rewards/margins": 6.6094553563501925,
"eval_rewards/rejected": -6.729257558847403,
"eval_runtime": 28.9259,
"eval_samples_per_second": 4.84,
"eval_steps_per_second": 2.42,
"kl": 0.0,
"step": 550
},
{
"epoch": 7.133333333333334,
"grad_norm": 0.6484764814376831,
"kl": 0.0,
"learning_rate": 1.1168938982367162e-06,
"logits/chosen": -4232963.555555556,
"logits/rejected": -8014400.0,
"logps/chosen": -18.053126205632715,
"logps/rejected": -94.37042622626582,
"loss": 0.1951,
"rewards/chosen": 0.06175481537241995,
"rewards/margins": 7.504841463661581,
"rewards/rejected": -7.443086648289161,
"step": 560
},
{
"epoch": 7.26031746031746,
"grad_norm": 1.4253727197647095,
"kl": 0.0,
"learning_rate": 1.0251113999421936e-06,
"logits/chosen": -5810530.157303371,
"logits/rejected": -5309918.647887324,
"logps/chosen": -14.917400917310394,
"logps/rejected": -89.89147777288733,
"loss": 0.2343,
"rewards/chosen": 0.2091124030981171,
"rewards/margins": 7.287517649026814,
"rewards/rejected": -7.078405245928697,
"step": 570
},
{
"epoch": 7.387301587301588,
"grad_norm": 0.9377846121788025,
"kl": 0.0,
"learning_rate": 9.362822335518062e-07,
"logits/chosen": -4028119.8048780486,
"logits/rejected": -6628607.58974359,
"logps/chosen": -8.946031523913872,
"logps/rejected": -93.52982271634616,
"loss": 0.1676,
"rewards/chosen": 0.678685444157298,
"rewards/margins": 8.047686783800728,
"rewards/rejected": -7.36900133964343,
"step": 580
},
{
"epoch": 7.514285714285714,
"grad_norm": 0.4060008227825165,
"kl": 0.0,
"learning_rate": 8.505842714900298e-07,
"logits/chosen": -5100800.831168831,
"logits/rejected": -7023735.518072289,
"logps/chosen": -14.822768174208603,
"logps/rejected": -92.95827842620481,
"loss": 0.1884,
"rewards/chosen": 0.11542927135120738,
"rewards/margins": 7.41691573478306,
"rewards/rejected": -7.301486463431853,
"step": 590
},
{
"epoch": 7.641269841269842,
"grad_norm": 0.44603800773620605,
"kl": 0.0,
"learning_rate": 7.681891162260016e-07,
"logits/chosen": -4478451.358024691,
"logits/rejected": -7117348.455696203,
"logps/chosen": -18.77003460165895,
"logps/rejected": -97.46780434137658,
"loss": 0.1961,
"rewards/chosen": 0.0032680240678198543,
"rewards/margins": 7.678102070250573,
"rewards/rejected": -7.674834046182753,
"step": 600
},
{
"epoch": 7.641269841269842,
"eval_logits/chosen": -4324965.079365079,
"eval_logits/rejected": -6328156.675324676,
"eval_logps/chosen": -16.93195258246528,
"eval_logps/rejected": -86.6986924208604,
"eval_loss": 0.2157014012336731,
"eval_rewards/chosen": -0.10731880248539032,
"eval_rewards/margins": 6.627598379941558,
"eval_rewards/rejected": -6.7349171824269485,
"eval_runtime": 28.9184,
"eval_samples_per_second": 4.841,
"eval_steps_per_second": 2.421,
"kl": 0.0,
"step": 600
},
{
"epoch": 7.768253968253968,
"grad_norm": 0.1919259876012802,
"kl": 0.0,
"learning_rate": 6.892617566550044e-07,
"logits/chosen": -4999124.8,
"logits/rejected": -7450809.6,
"logps/chosen": -15.9777099609375,
"logps/rejected": -94.30171508789063,
"loss": 0.1997,
"rewards/chosen": 0.05527666211128235,
"rewards/margins": 7.558959370851516,
"rewards/rejected": -7.503682708740234,
"step": 610
},
{
"epoch": 7.895238095238096,
"grad_norm": 0.4005780518054962,
"kl": 0.0,
"learning_rate": 6.139602377230247e-07,
"logits/chosen": -5132258.835443038,
"logits/rejected": -5855081.086419753,
"logps/chosen": -9.797328514388845,
"logps/rejected": -91.02250916280865,
"loss": 0.1697,
"rewards/chosen": 0.7228279113769531,
"rewards/margins": 7.915330533628111,
"rewards/rejected": -7.192502622251157,
"step": 620
},
{
"epoch": 8.025396825396825,
"grad_norm": 0.36264705657958984,
"kl": 0.0,
"learning_rate": 5.424353439559446e-07,
"logits/chosen": -5221032.8,
"logits/rejected": -7366360.0,
"logps/chosen": -14.4659912109375,
"logps/rejected": -95.17356567382812,
"loss": 0.1683,
"rewards/chosen": 0.5725198745727539,
"rewards/margins": 7.98663272857666,
"rewards/rejected": -7.414112854003906,
"step": 630
},
{
"epoch": 8.152380952380952,
"grad_norm": 0.14060212671756744,
"kl": 0.0,
"learning_rate": 4.748302975270838e-07,
"logits/chosen": -5158584.98630137,
"logits/rejected": -7203751.724137931,
"logps/chosen": -8.887396825502996,
"logps/rejected": -91.2306707974138,
"loss": 0.1526,
"rewards/chosen": 0.7418512318232288,
"rewards/margins": 7.8209843642316335,
"rewards/rejected": -7.079133132408405,
"step": 640
},
{
"epoch": 8.27936507936508,
"grad_norm": 1.8452895879745483,
"kl": 6.364190101623535,
"learning_rate": 4.1128047146765936e-07,
"logits/chosen": -4475600.963855422,
"logits/rejected": -6167757.714285715,
"logps/chosen": -10.917626070689005,
"logps/rejected": -88.98798954951299,
"loss": 0.1721,
"rewards/chosen": 0.7498773781650038,
"rewards/margins": 7.733158501338705,
"rewards/rejected": -6.983281123173701,
"step": 650
},
{
"epoch": 8.27936507936508,
"eval_logits/chosen": -4329249.523809524,
"eval_logits/rejected": -6320159.584415585,
"eval_logps/chosen": -17.140528118799605,
"eval_logps/rejected": -87.93011870941558,
"eval_loss": 0.21569015085697174,
"eval_rewards/chosen": -0.12817655290876115,
"eval_rewards/margins": 6.729884209570947,
"eval_rewards/rejected": -6.858060762479708,
"eval_runtime": 28.9092,
"eval_samples_per_second": 4.843,
"eval_steps_per_second": 2.421,
"kl": 0.0,
"step": 650
},
{
"epoch": 8.406349206349207,
"grad_norm": 0.17593051493167877,
"kl": 0.0,
"learning_rate": 3.51913118594458e-07,
"logits/chosen": -4085644.8,
"logits/rejected": -7389671.314285714,
"logps/chosen": -19.100984700520833,
"logps/rejected": -98.90892159598214,
"loss": 0.2138,
"rewards/chosen": 0.05359976026746962,
"rewards/margins": 7.967655284821041,
"rewards/rejected": -7.9140555245535715,
"step": 660
},
{
"epoch": 8.533333333333333,
"grad_norm": 0.11416521668434143,
"kl": 0.0,
"learning_rate": 2.9684711669750313e-07,
"logits/chosen": -5479745.471264368,
"logits/rejected": -7563712.0,
"logps/chosen": -26.79979907507184,
"logps/rejected": -94.35269959332192,
"loss": 0.2584,
"rewards/chosen": -0.9998831913389009,
"rewards/margins": 6.4843028505061335,
"rewards/rejected": -7.484186041845034,
"step": 670
},
{
"epoch": 8.66031746031746,
"grad_norm": 17.24502182006836,
"kl": 0.0,
"learning_rate": 2.4619273049796e-07,
"logits/chosen": -4943775.555555556,
"logits/rejected": -6408256.0,
"logps/chosen": -10.501305474175346,
"logps/rejected": -97.72674005681819,
"loss": 0.1406,
"rewards/chosen": 0.6181484858194987,
"rewards/margins": 8.29336793494947,
"rewards/rejected": -7.675219449129972,
"step": 680
},
{
"epoch": 8.787301587301588,
"grad_norm": 2.4637882709503174,
"kl": 0.0,
"learning_rate": 2.0005139085293945e-07,
"logits/chosen": -5370018.285714285,
"logits/rejected": -4552856.0,
"logps/chosen": -12.256514776320685,
"logps/rejected": -95.3783023231908,
"loss": 0.19,
"rewards/chosen": 0.5139439446585519,
"rewards/margins": 8.120755468096052,
"rewards/rejected": -7.6068115234375,
"step": 690
},
{
"epoch": 8.914285714285715,
"grad_norm": 5.328949928283691,
"kl": 0.0,
"learning_rate": 1.5851549164932118e-07,
"logits/chosen": -4298147.7402597405,
"logits/rejected": -6892323.469879518,
"logps/chosen": -12.497201894784903,
"logps/rejected": -95.17148672816265,
"loss": 0.1879,
"rewards/chosen": 0.4705321572043679,
"rewards/margins": 7.979313866071085,
"rewards/rejected": -7.508781708866717,
"step": 700
},
{
"epoch": 8.914285714285715,
"eval_logits/chosen": -4337002.666666667,
"eval_logits/rejected": -6325528.935064935,
"eval_logps/chosen": -17.09599376860119,
"eval_logps/rejected": -87.86666751217533,
"eval_loss": 0.21406607329845428,
"eval_rewards/chosen": -0.123722961970738,
"eval_rewards/margins": 6.7279925222520705,
"eval_rewards/rejected": -6.851715484222808,
"eval_runtime": 28.9604,
"eval_samples_per_second": 4.834,
"eval_steps_per_second": 2.417,
"kl": 0.0,
"step": 700
},
{
"epoch": 9.044444444444444,
"grad_norm": 1.3945693969726562,
"kl": 0.0,
"learning_rate": 1.2166820479329572e-07,
"logits/chosen": -4539691.317073171,
"logits/rejected": -7435501.128205128,
"logps/chosen": -17.02510480182927,
"logps/rejected": -93.79669345953526,
"loss": 0.2091,
"rewards/chosen": 0.06722603774652249,
"rewards/margins": 7.387677823103092,
"rewards/rejected": -7.32045178535657,
"step": 710
},
{
"epoch": 9.17142857142857,
"grad_norm": 1.6716283559799194,
"kl": 0.0,
"learning_rate": 8.958331366609424e-08,
"logits/chosen": -4157219.2,
"logits/rejected": -6966880.711111112,
"logps/chosen": -10.706447056361608,
"logps/rejected": -92.81272786458334,
"loss": 0.1464,
"rewards/chosen": 0.6557066781180245,
"rewards/margins": 8.015752386668371,
"rewards/rejected": -7.360045708550347,
"step": 720
},
{
"epoch": 9.2984126984127,
"grad_norm": 0.8815398812294006,
"kl": 0.0,
"learning_rate": 6.232506537939942e-08,
"logits/chosen": -3989393.734939759,
"logits/rejected": -5874582.441558441,
"logps/chosen": -15.733169004141565,
"logps/rejected": -99.3365145596591,
"loss": 0.1782,
"rewards/chosen": 0.12129955981151168,
"rewards/margins": 8.070443006696658,
"rewards/rejected": -7.949143446885146,
"step": 730
},
{
"epoch": 9.425396825396826,
"grad_norm": 0.6676042079925537,
"kl": 0.0,
"learning_rate": 3.994804212627462e-08,
"logits/chosen": -4523181.714285715,
"logits/rejected": -7066150.554216867,
"logps/chosen": -17.300652838372564,
"logps/rejected": -97.79651614269578,
"loss": 0.1915,
"rewards/chosen": -0.0630725266097428,
"rewards/margins": 7.753394023183179,
"rewards/rejected": -7.8164665497929215,
"step": 740
},
{
"epoch": 9.552380952380952,
"grad_norm": 0.48196038603782654,
"kl": 0.0,
"learning_rate": 2.2497051885228825e-08,
"logits/chosen": -4720270.769230769,
"logits/rejected": -6136928.463768116,
"logps/chosen": -19.533128004807693,
"logps/rejected": -87.8185009057971,
"loss": 0.2335,
"rewards/chosen": -0.10939908289647364,
"rewards/margins": 6.794888946428708,
"rewards/rejected": -6.904288029325182,
"step": 750
},
{
"epoch": 9.552380952380952,
"eval_logits/chosen": -4329628.444444444,
"eval_logits/rejected": -6335641.35064935,
"eval_logps/chosen": -17.1407470703125,
"eval_logps/rejected": -87.96758319805195,
"eval_loss": 0.21577061712741852,
"eval_rewards/chosen": -0.12819823007734996,
"eval_rewards/margins": 6.733608664200247,
"eval_rewards/rejected": -6.861806894277597,
"eval_runtime": 28.9438,
"eval_samples_per_second": 4.837,
"eval_steps_per_second": 2.418,
"kl": 0.0,
"step": 750
},
{
"epoch": 9.679365079365079,
"grad_norm": 1.612648844718933,
"kl": 0.0,
"learning_rate": 1.0007038696262517e-08,
"logits/chosen": -5430106.046511628,
"logits/rejected": -6555316.324324325,
"logps/chosen": -11.707361265670421,
"logps/rejected": -99.71012589738176,
"loss": 0.1738,
"rewards/chosen": 0.5538684933684593,
"rewards/margins": 8.572764647823695,
"rewards/rejected": -8.018896154455236,
"step": 760
},
{
"epoch": 9.806349206349207,
"grad_norm": 0.2559433877468109,
"kl": 0.0,
"learning_rate": 2.5030126885694505e-09,
"logits/chosen": -4649945.142857143,
"logits/rejected": -6114650.52631579,
"logps/chosen": -11.616304670061384,
"logps/rejected": -92.70840614720395,
"loss": 0.1815,
"rewards/chosen": 0.6737716311500186,
"rewards/margins": 7.9078245903913835,
"rewards/rejected": -7.234052959241365,
"step": 770
},
{
"epoch": 9.933333333333334,
"grad_norm": 0.04368880018591881,
"kl": 0.0,
"learning_rate": 0.0,
"logits/chosen": -4589314.493506493,
"logits/rejected": -6493096.86746988,
"logps/chosen": -12.316443505225244,
"logps/rejected": -93.54054499246988,
"loss": 0.1779,
"rewards/chosen": 0.5961562317687196,
"rewards/margins": 8.000441340955467,
"rewards/rejected": -7.404285109186747,
"step": 780
},
{
"epoch": 9.933333333333334,
"step": 780,
"total_flos": 6.965271112148582e+16,
"train_loss": 0.26206854444283706,
"train_runtime": 4670.2135,
"train_samples_per_second": 2.698,
"train_steps_per_second": 0.167
}
],
"logging_steps": 10,
"max_steps": 780,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.965271112148582e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}