Llama-3.1-8B-Instruct-KTO-500 / trainer_state.json
chchen's picture
End of training
452fc0c verified
{
"best_metric": 0.2732886075973511,
"best_model_checkpoint": "saves/sycophancy/Llama-8B-3.1-Instruct/kto-500/checkpoint-500",
"epoch": 9.955555555555556,
"eval_steps": 50,
"global_step": 560,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.17777777777777778,
"grad_norm": 0.6533107161521912,
"kl": 4.067477226257324,
"learning_rate": 8.928571428571429e-07,
"logits/chosen": -6841018.329113924,
"logits/rejected": -7836444.444444444,
"logps/chosen": -16.606612193433545,
"logps/rejected": -19.352891710069443,
"loss": 0.5,
"rewards/chosen": 0.0014614655247217492,
"rewards/margins": -0.000693329951226255,
"rewards/rejected": 0.0021547954759480042,
"step": 10
},
{
"epoch": 0.35555555555555557,
"grad_norm": 0.6705703139305115,
"kl": 5.545372486114502,
"learning_rate": 1.7857142857142859e-06,
"logits/chosen": -6803992.303797469,
"logits/rejected": -7634997.728395062,
"logps/chosen": -17.628129017503955,
"logps/rejected": -19.349335093557098,
"loss": 0.5003,
"rewards/chosen": -0.001114800006528444,
"rewards/margins": -0.002355615283981266,
"rewards/rejected": 0.0012408152774528221,
"step": 20
},
{
"epoch": 0.5333333333333333,
"grad_norm": 0.6907099485397339,
"kl": 2.6640281677246094,
"learning_rate": 2.6785714285714285e-06,
"logits/chosen": -6161688.746666667,
"logits/rejected": -7775072.376470588,
"logps/chosen": -16.01873046875,
"logps/rejected": -19.69498650045956,
"loss": 0.499,
"rewards/chosen": 0.0048561612764994305,
"rewards/margins": 0.005783557833409777,
"rewards/rejected": -0.0009273965569103465,
"step": 30
},
{
"epoch": 0.7111111111111111,
"grad_norm": 1.0000953674316406,
"kl": 3.6405258178710938,
"learning_rate": 3.5714285714285718e-06,
"logits/chosen": -6069453.76744186,
"logits/rejected": -7335607.3513513515,
"logps/chosen": -16.918597554051598,
"logps/rejected": -19.19496608424831,
"loss": 0.4999,
"rewards/chosen": 0.0016274835134661475,
"rewards/margins": 0.000747067534196759,
"rewards/rejected": 0.0008804159792693885,
"step": 40
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.7750270962715149,
"kl": 4.009792327880859,
"learning_rate": 4.464285714285715e-06,
"logits/chosen": -5697954.37037037,
"logits/rejected": -7671765.873417721,
"logps/chosen": -16.696614583333332,
"logps/rejected": -19.640096543710442,
"loss": 0.499,
"rewards/chosen": 0.016511881792986835,
"rewards/margins": 0.008323215370830996,
"rewards/rejected": 0.008188666422155839,
"step": 50
},
{
"epoch": 0.8888888888888888,
"eval_logits/chosen": -6797012.48,
"eval_logits/rejected": -7105245.44,
"eval_logps/chosen": -15.1082421875,
"eval_logps/rejected": -19.328565673828123,
"eval_loss": 0.49956679344177246,
"eval_rewards/chosen": 0.01544377326965332,
"eval_rewards/margins": -0.0007611429691314706,
"eval_rewards/rejected": 0.01620491623878479,
"eval_runtime": 20.7819,
"eval_samples_per_second": 4.812,
"eval_steps_per_second": 2.406,
"kl": 3.8203978538513184,
"step": 50
},
{
"epoch": 1.0666666666666667,
"grad_norm": 0.760673463344574,
"kl": 5.259383201599121,
"learning_rate": 4.999222955002041e-06,
"logits/chosen": -6167793.6,
"logits/rejected": -6915556.0,
"logps/chosen": -18.555807495117186,
"logps/rejected": -20.55987091064453,
"loss": 0.4979,
"rewards/chosen": 0.030471977591514588,
"rewards/margins": 0.017126622796058658,
"rewards/rejected": 0.013345354795455932,
"step": 60
},
{
"epoch": 1.2444444444444445,
"grad_norm": 0.8709146976470947,
"kl": 5.821277618408203,
"learning_rate": 4.990486745229364e-06,
"logits/chosen": -5566239.567567567,
"logits/rejected": -7954213.209302326,
"logps/chosen": -16.395964751372468,
"logps/rejected": -19.50782101653343,
"loss": 0.4952,
"rewards/chosen": 0.06164715096757219,
"rewards/margins": 0.048601815042669555,
"rewards/rejected": 0.013045335924902628,
"step": 70
},
{
"epoch": 1.4222222222222223,
"grad_norm": 0.9092549681663513,
"kl": 7.013403415679932,
"learning_rate": 4.9720770655628216e-06,
"logits/chosen": -6926604.8,
"logits/rejected": -7139911.771428571,
"logps/chosen": -16.64070095486111,
"logps/rejected": -18.566683523995536,
"loss": 0.4886,
"rewards/chosen": 0.08301433987087674,
"rewards/margins": 0.08200664118168846,
"rewards/rejected": 0.0010076986891882761,
"step": 80
},
{
"epoch": 1.6,
"grad_norm": 1.182418942451477,
"kl": 17.02285385131836,
"learning_rate": 4.944065422298262e-06,
"logits/chosen": -6318053.6,
"logits/rejected": -6837182.4,
"logps/chosen": -15.03062744140625,
"logps/rejected": -20.429293823242187,
"loss": 0.4805,
"rewards/chosen": 0.1787983775138855,
"rewards/margins": 0.15114764571189881,
"rewards/rejected": 0.027650731801986694,
"step": 90
},
{
"epoch": 1.7777777777777777,
"grad_norm": 1.1860688924789429,
"kl": 15.505624771118164,
"learning_rate": 4.90656061737503e-06,
"logits/chosen": -6204396.883116883,
"logits/rejected": -6490921.638554217,
"logps/chosen": -14.448405476359579,
"logps/rejected": -20.421110222138555,
"loss": 0.4715,
"rewards/chosen": 0.25868542163403,
"rewards/margins": 0.23589365106665308,
"rewards/rejected": 0.02279177056737693,
"step": 100
},
{
"epoch": 1.7777777777777777,
"eval_logits/chosen": -6498615.68,
"eval_logits/rejected": -7022910.72,
"eval_logps/chosen": -12.5245361328125,
"eval_logps/rejected": -19.137769775390623,
"eval_loss": 0.47049248218536377,
"eval_rewards/chosen": 0.27381431579589843,
"eval_rewards/margins": 0.23852983951568602,
"eval_rewards/rejected": 0.035284476280212404,
"eval_runtime": 20.7743,
"eval_samples_per_second": 4.814,
"eval_steps_per_second": 2.407,
"kl": 12.469597816467285,
"step": 100
},
{
"epoch": 1.9555555555555557,
"grad_norm": 1.5210201740264893,
"kl": 10.133670806884766,
"learning_rate": 4.859708325770919e-06,
"logits/chosen": -5213267.692307692,
"logits/rejected": -8371516.87804878,
"logps/chosen": -14.842853252704327,
"logps/rejected": -20.327931473894818,
"loss": 0.4463,
"rewards/chosen": 0.31009825682028747,
"rewards/margins": 0.42884164157698046,
"rewards/rejected": -0.11874338475669302,
"step": 110
},
{
"epoch": 2.1333333333333333,
"grad_norm": 1.9936622381210327,
"kl": 1.3701114654541016,
"learning_rate": 4.80369052967602e-06,
"logits/chosen": -5974752.0,
"logits/rejected": -6628083.368421053,
"logps/chosen": -13.366662888299851,
"logps/rejected": -21.737105520148027,
"loss": 0.4121,
"rewards/chosen": 0.37865970248267766,
"rewards/margins": 0.7291635236046965,
"rewards/rejected": -0.3505038211220189,
"step": 120
},
{
"epoch": 2.311111111111111,
"grad_norm": 1.529518961906433,
"kl": 4.181853294372559,
"learning_rate": 4.7387248116432524e-06,
"logits/chosen": -5894920.421052632,
"logits/rejected": -6893739.428571428,
"logps/chosen": -13.183826647306743,
"logps/rejected": -23.257681710379465,
"loss": 0.4071,
"rewards/chosen": 0.38488915092066717,
"rewards/margins": 0.776886170370537,
"rewards/rejected": -0.3919970194498698,
"step": 130
},
{
"epoch": 2.488888888888889,
"grad_norm": 2.2388455867767334,
"kl": 11.135213851928711,
"learning_rate": 4.665063509461098e-06,
"logits/chosen": -4817724.0,
"logits/rejected": -7434909.090909091,
"logps/chosen": -12.073420206705729,
"logps/rejected": -26.122675115411933,
"loss": 0.3807,
"rewards/chosen": 0.41833003362019855,
"rewards/margins": 1.0169767610954517,
"rewards/rejected": -0.598646727475253,
"step": 140
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.9637298583984375,
"kl": 0.0,
"learning_rate": 4.5829927360311224e-06,
"logits/chosen": -4782145.406593407,
"logits/rejected": -6822974.144927536,
"logps/chosen": -14.056957202953297,
"logps/rejected": -32.71091627038044,
"loss": 0.3459,
"rewards/chosen": 0.36075919015066965,
"rewards/margins": 1.566741674829961,
"rewards/rejected": -1.2059824846792913,
"step": 150
},
{
"epoch": 2.6666666666666665,
"eval_logits/chosen": -5608189.44,
"eval_logits/rejected": -6636171.52,
"eval_logps/chosen": -14.359583740234376,
"eval_logps/rejected": -30.49203369140625,
"eval_loss": 0.3755200207233429,
"eval_rewards/chosen": 0.090309476852417,
"eval_rewards/margins": 1.1904513835906982,
"eval_rewards/rejected": -1.1001419067382812,
"eval_runtime": 20.8081,
"eval_samples_per_second": 4.806,
"eval_steps_per_second": 2.403,
"kl": 0.7521286010742188,
"step": 150
},
{
"epoch": 2.8444444444444446,
"grad_norm": 2.555079698562622,
"kl": 2.267221450805664,
"learning_rate": 4.492831268057307e-06,
"logits/chosen": -3827353.7560975607,
"logits/rejected": -6784299.487179487,
"logps/chosen": -13.25042129144436,
"logps/rejected": -33.39762682792468,
"loss": 0.3292,
"rewards/chosen": 0.3535805678949123,
"rewards/margins": 1.744958921698498,
"rewards/rejected": -1.3913783538035858,
"step": 160
},
{
"epoch": 3.022222222222222,
"grad_norm": 1.4790682792663574,
"kl": 0.0831449031829834,
"learning_rate": 4.394929307863633e-06,
"logits/chosen": -5203950.702702703,
"logits/rejected": -6706759.441860465,
"logps/chosen": -15.109988650760135,
"logps/rejected": -37.56380870730378,
"loss": 0.2896,
"rewards/chosen": 0.43377953606682856,
"rewards/margins": 2.1837287917218067,
"rewards/rejected": -1.7499492556549783,
"step": 170
},
{
"epoch": 3.2,
"grad_norm": 1.453729271888733,
"kl": 1.8771247863769531,
"learning_rate": 4.289667123149296e-06,
"logits/chosen": -4639765.333333333,
"logits/rejected": -6930286.6823529415,
"logps/chosen": -13.8180029296875,
"logps/rejected": -40.942793543198526,
"loss": 0.2707,
"rewards/chosen": 0.4121772257486979,
"rewards/margins": 2.442220165776271,
"rewards/rejected": -2.0300429400275735,
"step": 180
},
{
"epoch": 3.3777777777777778,
"grad_norm": 1.3966221809387207,
"kl": 9.155083656311035,
"learning_rate": 4.177453569964925e-06,
"logits/chosen": -5388630.857142857,
"logits/rejected": -6755297.684210527,
"logps/chosen": -12.648887997581845,
"logps/rejected": -39.70372250205592,
"loss": 0.2847,
"rewards/chosen": 0.5054586501348586,
"rewards/margins": 2.584699549471824,
"rewards/rejected": -2.0792408993369653,
"step": 190
},
{
"epoch": 3.5555555555555554,
"grad_norm": 6.537291526794434,
"kl": 0.0,
"learning_rate": 4.058724504646834e-06,
"logits/chosen": -4172365.879518072,
"logits/rejected": -6079465.558441559,
"logps/chosen": -14.219998646931476,
"logps/rejected": -46.81524896002435,
"loss": 0.2879,
"rewards/chosen": 0.2552003975374153,
"rewards/margins": 2.85085913333287,
"rewards/rejected": -2.5956587357954546,
"step": 200
},
{
"epoch": 3.5555555555555554,
"eval_logits/chosen": -5304341.76,
"eval_logits/rejected": -6579046.4,
"eval_logps/chosen": -15.011697998046875,
"eval_logps/rejected": -42.5760400390625,
"eval_loss": 0.3254357576370239,
"eval_rewards/chosen": 0.025098147392272948,
"eval_rewards/margins": 2.3336407804489134,
"eval_rewards/rejected": -2.3085426330566405,
"eval_runtime": 20.7983,
"eval_samples_per_second": 4.808,
"eval_steps_per_second": 2.404,
"kl": 0.0,
"step": 200
},
{
"epoch": 3.7333333333333334,
"grad_norm": 2.430555820465088,
"kl": 0.0,
"learning_rate": 3.933941090877615e-06,
"logits/chosen": -4026738.4,
"logits/rejected": -6801368.8,
"logps/chosen": -11.006294250488281,
"logps/rejected": -45.532797241210936,
"loss": 0.2517,
"rewards/chosen": 0.5302759170532226,
"rewards/margins": 3.205550193786621,
"rewards/rejected": -2.6752742767333983,
"step": 210
},
{
"epoch": 3.911111111111111,
"grad_norm": 1.6560513973236084,
"kl": 3.6237878799438477,
"learning_rate": 3.8035880084487454e-06,
"logits/chosen": -4789041.230769231,
"logits/rejected": -7053110.634146341,
"logps/chosen": -10.559817583133013,
"logps/rejected": -46.676900724085364,
"loss": 0.2392,
"rewards/chosen": 0.6918807885585687,
"rewards/margins": 3.4286504880274142,
"rewards/rejected": -2.7367696994688453,
"step": 220
},
{
"epoch": 4.088888888888889,
"grad_norm": 1.3882936239242554,
"kl": 1.455021858215332,
"learning_rate": 3.6681715706826555e-06,
"logits/chosen": -4748690.823529412,
"logits/rejected": -6739801.6,
"logps/chosen": -9.264028033088236,
"logps/rejected": -50.18566080729167,
"loss": 0.2326,
"rewards/chosen": 0.8066374834846047,
"rewards/margins": 3.8256635658413756,
"rewards/rejected": -3.019026082356771,
"step": 230
},
{
"epoch": 4.266666666666667,
"grad_norm": 1.702983021736145,
"kl": 0.0,
"learning_rate": 3.5282177578265295e-06,
"logits/chosen": -4685888.7710843375,
"logits/rejected": -6661258.389610389,
"logps/chosen": -7.184921816170934,
"logps/rejected": -53.11224761566559,
"loss": 0.2209,
"rewards/chosen": 0.9436166602444936,
"rewards/margins": 4.360447540545833,
"rewards/rejected": -3.4168308803013394,
"step": 240
},
{
"epoch": 4.444444444444445,
"grad_norm": 0.7544198036193848,
"kl": 0.0,
"learning_rate": 3.384270174056454e-06,
"logits/chosen": -4689330.8,
"logits/rejected": -6276662.8,
"logps/chosen": -12.63294677734375,
"logps/rejected": -61.84871826171875,
"loss": 0.2319,
"rewards/chosen": 0.31192424297332766,
"rewards/margins": 4.568389391899109,
"rewards/rejected": -4.256465148925781,
"step": 250
},
{
"epoch": 4.444444444444445,
"eval_logits/chosen": -5472120.32,
"eval_logits/rejected": -6862511.36,
"eval_logps/chosen": -16.78877197265625,
"eval_logps/rejected": -54.95546875,
"eval_loss": 0.3014616072177887,
"eval_rewards/chosen": -0.1526092529296875,
"eval_rewards/margins": 3.393876037597656,
"eval_rewards/rejected": -3.5464852905273436,
"eval_runtime": 20.809,
"eval_samples_per_second": 4.806,
"eval_steps_per_second": 2.403,
"kl": 3.790660858154297,
"step": 250
},
{
"epoch": 4.622222222222222,
"grad_norm": 1.594618797302246,
"kl": 0.0,
"learning_rate": 3.236887936027261e-06,
"logits/chosen": -5005965.552941176,
"logits/rejected": -6077611.52,
"logps/chosen": -12.832533892463236,
"logps/rejected": -61.87927083333334,
"loss": 0.2328,
"rewards/chosen": 0.6093355066636029,
"rewards/margins": 4.77091916551777,
"rewards/rejected": -4.161583658854167,
"step": 260
},
{
"epoch": 4.8,
"grad_norm": 14.283431053161621,
"kl": 0.0,
"learning_rate": 3.0866435011692884e-06,
"logits/chosen": -3974209.969230769,
"logits/rejected": -7320299.115789474,
"logps/chosen": -12.673212139423077,
"logps/rejected": -58.971828741776314,
"loss": 0.222,
"rewards/chosen": 0.35330341045673075,
"rewards/margins": 4.22993337036627,
"rewards/rejected": -3.8766299599095393,
"step": 270
},
{
"epoch": 4.977777777777778,
"grad_norm": 6.273312091827393,
"kl": 0.0,
"learning_rate": 2.9341204441673267e-06,
"logits/chosen": -4811573.7011494255,
"logits/rejected": -6532414.2465753425,
"logps/chosen": -12.03460903825431,
"logps/rejected": -72.13428403253425,
"loss": 0.2032,
"rewards/chosen": 0.4564220384619702,
"rewards/margins": 5.712377146686499,
"rewards/rejected": -5.255955108224529,
"step": 280
},
{
"epoch": 5.155555555555556,
"grad_norm": 1.409645676612854,
"kl": 0.0,
"learning_rate": 2.7799111902582697e-06,
"logits/chosen": -5135937.641025641,
"logits/rejected": -7215049.365853659,
"logps/chosen": -13.245896559495192,
"logps/rejected": -71.7289979516006,
"loss": 0.2097,
"rewards/chosen": 0.39485403207632214,
"rewards/margins": 5.603232521500865,
"rewards/rejected": -5.208378489424542,
"step": 290
},
{
"epoch": 5.333333333333333,
"grad_norm": 1.3076075315475464,
"kl": 0.0,
"learning_rate": 2.624614714151743e-06,
"logits/chosen": -4269975.518072289,
"logits/rejected": -6750665.974025974,
"logps/chosen": -9.87456392954631,
"logps/rejected": -71.41611074472402,
"loss": 0.1971,
"rewards/chosen": 0.6907072411962303,
"rewards/margins": 5.822813477245337,
"rewards/rejected": -5.132106236049107,
"step": 300
},
{
"epoch": 5.333333333333333,
"eval_logits/chosen": -5352769.92,
"eval_logits/rejected": -6700284.16,
"eval_logps/chosen": -20.0192236328125,
"eval_logps/rejected": -65.3341015625,
"eval_loss": 0.2927246391773224,
"eval_rewards/chosen": -0.4756543731689453,
"eval_rewards/margins": 4.108694686889649,
"eval_rewards/rejected": -4.584349060058594,
"eval_runtime": 20.7933,
"eval_samples_per_second": 4.809,
"eval_steps_per_second": 2.405,
"kl": 0.0,
"step": 300
},
{
"epoch": 5.511111111111111,
"grad_norm": 1.6624188423156738,
"kl": 0.0,
"learning_rate": 2.4688342135114625e-06,
"logits/chosen": -4672509.506493507,
"logits/rejected": -6401643.951807229,
"logps/chosen": -14.955799449573863,
"logps/rejected": -73.92682840737952,
"loss": 0.2184,
"rewards/chosen": 0.1713225996339476,
"rewards/margins": 5.420992421499535,
"rewards/rejected": -5.249669821865587,
"step": 310
},
{
"epoch": 5.688888888888889,
"grad_norm": 1.1759079694747925,
"kl": 0.0,
"learning_rate": 2.3131747660339396e-06,
"logits/chosen": -3520974.9873417723,
"logits/rejected": -6545384.691358024,
"logps/chosen": -12.296756792672072,
"logps/rejected": -71.31258439429013,
"loss": 0.2118,
"rewards/chosen": 0.5269823677932159,
"rewards/margins": 5.867736370493833,
"rewards/rejected": -5.340754002700617,
"step": 320
},
{
"epoch": 5.866666666666667,
"grad_norm": 0.6257539391517639,
"kl": 3.781810760498047,
"learning_rate": 2.158240979224817e-06,
"logits/chosen": -3069954.6329113925,
"logits/rejected": -6776339.75308642,
"logps/chosen": -10.886953619462025,
"logps/rejected": -71.04876181520062,
"loss": 0.1733,
"rewards/chosen": 0.8049045514456833,
"rewards/margins": 5.933409175047149,
"rewards/rejected": -5.128504623601466,
"step": 330
},
{
"epoch": 6.044444444444444,
"grad_norm": 0.3074154853820801,
"kl": 0.0,
"learning_rate": 2.004634642001507e-06,
"logits/chosen": -5546165.6,
"logits/rejected": -5844707.2,
"logps/chosen": -7.493644714355469,
"logps/rejected": -72.3456787109375,
"loss": 0.189,
"rewards/chosen": 1.0212175369262695,
"rewards/margins": 6.292597389221192,
"rewards/rejected": -5.271379852294922,
"step": 340
},
{
"epoch": 6.222222222222222,
"grad_norm": 0.8704222440719604,
"kl": 2.9211883544921875,
"learning_rate": 1.852952387243698e-06,
"logits/chosen": -3859937.2467532465,
"logits/rejected": -6969344.0,
"logps/chosen": -13.282477044439934,
"logps/rejected": -82.35998682228916,
"loss": 0.1825,
"rewards/chosen": 0.44339762105570213,
"rewards/margins": 6.771900597806455,
"rewards/rejected": -6.328502976750753,
"step": 350
},
{
"epoch": 6.222222222222222,
"eval_logits/chosen": -5317811.2,
"eval_logits/rejected": -6587809.28,
"eval_logps/chosen": -21.9388818359375,
"eval_logps/rejected": -70.896962890625,
"eval_loss": 0.2855421304702759,
"eval_rewards/chosen": -0.6676201629638672,
"eval_rewards/margins": 4.473014602661133,
"eval_rewards/rejected": -5.140634765625,
"eval_runtime": 20.782,
"eval_samples_per_second": 4.812,
"eval_steps_per_second": 2.406,
"kl": 0.0,
"step": 350
},
{
"epoch": 6.4,
"grad_norm": 2.186678886413574,
"kl": 0.0,
"learning_rate": 1.7037833743707892e-06,
"logits/chosen": -3891186.701298701,
"logits/rejected": -6218544.192771085,
"logps/chosen": -11.341275301846592,
"logps/rejected": -76.01284826807229,
"loss": 0.2035,
"rewards/chosen": 0.5609105097783076,
"rewards/margins": 6.146511948737269,
"rewards/rejected": -5.585601438958961,
"step": 360
},
{
"epoch": 6.5777777777777775,
"grad_norm": 0.6433426141738892,
"kl": 0.0,
"learning_rate": 1.5577070009474872e-06,
"logits/chosen": -4631077.608247423,
"logits/rejected": -5847530.158730159,
"logps/chosen": -11.66492328447165,
"logps/rejected": -78.18622116815476,
"loss": 0.2251,
"rewards/chosen": 0.4717446160070675,
"rewards/margins": 6.37879097947433,
"rewards/rejected": -5.907046363467262,
"step": 370
},
{
"epoch": 6.7555555555555555,
"grad_norm": 1.106202483177185,
"kl": 0.0,
"learning_rate": 1.415290652206105e-06,
"logits/chosen": -4495871.157894737,
"logits/rejected": -6737105.523809524,
"logps/chosen": -9.170941804584704,
"logps/rejected": -74.34019252232143,
"loss": 0.2006,
"rewards/chosen": 0.8608366313733553,
"rewards/margins": 6.370171740539092,
"rewards/rejected": -5.509335109165737,
"step": 380
},
{
"epoch": 6.933333333333334,
"grad_norm": 0.3888777494430542,
"kl": 2.491642951965332,
"learning_rate": 1.2770874972267777e-06,
"logits/chosen": -4726923.341772152,
"logits/rejected": -6615125.333333333,
"logps/chosen": -11.210519525069225,
"logps/rejected": -79.0329258294753,
"loss": 0.1732,
"rewards/chosen": 0.7415898962865902,
"rewards/margins": 6.530411646503604,
"rewards/rejected": -5.788821750217014,
"step": 390
},
{
"epoch": 7.111111111111111,
"grad_norm": 1.2314908504486084,
"kl": 0.0,
"learning_rate": 1.1436343403356019e-06,
"logits/chosen": -3235399.314285714,
"logits/rejected": -6684086.755555555,
"logps/chosen": -16.83338623046875,
"logps/rejected": -76.9798611111111,
"loss": 0.1996,
"rewards/chosen": -0.024504613876342774,
"rewards/margins": 5.710608911514282,
"rewards/rejected": -5.735113525390625,
"step": 400
},
{
"epoch": 7.111111111111111,
"eval_logits/chosen": -5272716.48,
"eval_logits/rejected": -6501901.44,
"eval_logps/chosen": -22.1139404296875,
"eval_logps/rejected": -72.409404296875,
"eval_loss": 0.28036418557167053,
"eval_rewards/chosen": -0.6851260375976562,
"eval_rewards/margins": 4.606752624511719,
"eval_rewards/rejected": -5.291878662109375,
"eval_runtime": 20.812,
"eval_samples_per_second": 4.805,
"eval_steps_per_second": 2.402,
"kl": 0.0,
"step": 400
},
{
"epoch": 7.288888888888889,
"grad_norm": 11.075384140014648,
"kl": 0.0,
"learning_rate": 1.0154495360662464e-06,
"logits/chosen": -5561997.333333333,
"logits/rejected": -4703039.157894737,
"logps/chosen": -13.027040027436756,
"logps/rejected": -82.51130114103618,
"loss": 0.2111,
"rewards/chosen": 0.3391632352556501,
"rewards/margins": 6.753453010903264,
"rewards/rejected": -6.414289775647615,
"step": 410
},
{
"epoch": 7.466666666666667,
"grad_norm": 5.149106502532959,
"kl": 0.0,
"learning_rate": 8.930309757836517e-07,
"logits/chosen": -3194344.6486486485,
"logits/rejected": -6930841.302325581,
"logps/chosen": -12.075965675147804,
"logps/rejected": -80.88395371547965,
"loss": 0.1964,
"rewards/chosen": 0.4253324560216955,
"rewards/margins": 6.625286466741172,
"rewards/rejected": -6.199954010719477,
"step": 420
},
{
"epoch": 7.644444444444445,
"grad_norm": 1.6590875387191772,
"kl": 0.0,
"learning_rate": 7.768541537901325e-07,
"logits/chosen": -4468182.4,
"logits/rejected": -6386037.942857143,
"logps/chosen": -7.90546129014757,
"logps/rejected": -78.26188616071428,
"loss": 0.1915,
"rewards/chosen": 0.8592787848578559,
"rewards/margins": 6.713061656649151,
"rewards/rejected": -5.853782871791295,
"step": 430
},
{
"epoch": 7.822222222222222,
"grad_norm": 0.5123002529144287,
"kl": 11.214415550231934,
"learning_rate": 6.673703204254348e-07,
"logits/chosen": -1876414.5777777778,
"logits/rejected": -7622743.771428571,
"logps/chosen": -12.83230251736111,
"logps/rejected": -82.74340122767858,
"loss": 0.1886,
"rewards/chosen": 0.6239681667751736,
"rewards/margins": 6.8449258955698165,
"rewards/rejected": -6.220957728794643,
"step": 440
},
{
"epoch": 8.0,
"grad_norm": 0.4275095760822296,
"kl": 0.0,
"learning_rate": 5.650047293344316e-07,
"logits/chosen": -5781915.701492538,
"logits/rejected": -6046539.698924731,
"logps/chosen": -13.672110030900187,
"logps/rejected": -80.53140225974462,
"loss": 0.1776,
"rewards/chosen": 0.4122744318264634,
"rewards/margins": 6.3341966848506575,
"rewards/rejected": -5.921922253024194,
"step": 450
},
{
"epoch": 8.0,
"eval_logits/chosen": -5282150.4,
"eval_logits/rejected": -6475059.2,
"eval_logps/chosen": -22.7550830078125,
"eval_logps/rejected": -74.3277001953125,
"eval_loss": 0.27525943517684937,
"eval_rewards/chosen": -0.7492402648925781,
"eval_rewards/margins": 4.734468231201172,
"eval_rewards/rejected": -5.48370849609375,
"eval_runtime": 20.8139,
"eval_samples_per_second": 4.804,
"eval_steps_per_second": 2.402,
"kl": 0.0,
"step": 450
},
{
"epoch": 8.177777777777777,
"grad_norm": 6.894736289978027,
"kl": 0.0,
"learning_rate": 4.7015498571035877e-07,
"logits/chosen": -3783783.5061728396,
"logits/rejected": -6069178.734177215,
"logps/chosen": -5.430916868610146,
"logps/rejected": -82.43604751780063,
"loss": 0.1634,
"rewards/chosen": 0.9972056636103878,
"rewards/margins": 7.210469452920119,
"rewards/rejected": -6.213263789309731,
"step": 460
},
{
"epoch": 8.355555555555556,
"grad_norm": 0.09748721122741699,
"kl": 0.0,
"learning_rate": 3.831895019292897e-07,
"logits/chosen": -7059043.7402597405,
"logits/rejected": -5150067.277108434,
"logps/chosen": -17.455629819399352,
"logps/rejected": -83.4837749435241,
"loss": 0.1944,
"rewards/chosen": 0.07330032447715859,
"rewards/margins": 6.58620889234237,
"rewards/rejected": -6.512908567865211,
"step": 470
},
{
"epoch": 8.533333333333333,
"grad_norm": 37.684791564941406,
"kl": 0.0,
"learning_rate": 3.044460665744284e-07,
"logits/chosen": -5911329.28,
"logits/rejected": -5294802.070588236,
"logps/chosen": -13.222509765625,
"logps/rejected": -76.97578699448529,
"loss": 0.2056,
"rewards/chosen": 0.3842613474527995,
"rewards/margins": 6.094753649842505,
"rewards/rejected": -5.710492302389706,
"step": 480
},
{
"epoch": 8.71111111111111,
"grad_norm": 0.48345357179641724,
"kl": 0.0,
"learning_rate": 2.3423053240837518e-07,
"logits/chosen": -3852471.8139534886,
"logits/rejected": -6658902.486486486,
"logps/chosen": -9.945970402207486,
"logps/rejected": -84.33938846072635,
"loss": 0.1847,
"rewards/chosen": 0.8006199681481649,
"rewards/margins": 7.142369004782456,
"rewards/rejected": -6.34174903663429,
"step": 490
},
{
"epoch": 8.88888888888889,
"grad_norm": 0.2579200565814972,
"kl": 0.0,
"learning_rate": 1.7281562838948968e-07,
"logits/chosen": -3453992.727272727,
"logits/rejected": -7513944.0,
"logps/chosen": -10.801272305575283,
"logps/rejected": -81.59269205729167,
"loss": 0.208,
"rewards/chosen": 0.5914750532670454,
"rewards/margins": 6.682477623525292,
"rewards/rejected": -6.091002570258246,
"step": 500
},
{
"epoch": 8.88888888888889,
"eval_logits/chosen": -5270992.0,
"eval_logits/rejected": -6459799.04,
"eval_logps/chosen": -22.9960888671875,
"eval_logps/rejected": -75.276025390625,
"eval_loss": 0.2732886075973511,
"eval_rewards/chosen": -0.7733412170410157,
"eval_rewards/margins": 4.80520004272461,
"eval_rewards/rejected": -5.578541259765625,
"eval_runtime": 20.8559,
"eval_samples_per_second": 4.795,
"eval_steps_per_second": 2.397,
"kl": 0.0,
"step": 500
},
{
"epoch": 9.066666666666666,
"grad_norm": 0.3732987344264984,
"kl": 0.0,
"learning_rate": 1.2043990034669413e-07,
"logits/chosen": -4063886.1298701297,
"logits/rejected": -6919009.927710843,
"logps/chosen": -19.0400485744724,
"logps/rejected": -83.49184629141567,
"loss": 0.2051,
"rewards/chosen": -0.03847099898697494,
"rewards/margins": 6.430230467033356,
"rewards/rejected": -6.468701466020331,
"step": 510
},
{
"epoch": 9.244444444444444,
"grad_norm": 0.415623277425766,
"kl": 0.0,
"learning_rate": 7.730678442730539e-08,
"logits/chosen": -5067307.52,
"logits/rejected": -6418687.247058824,
"logps/chosen": -11.464226888020834,
"logps/rejected": -86.0582950367647,
"loss": 0.1587,
"rewards/chosen": 0.5690560913085938,
"rewards/margins": 7.230119251924403,
"rewards/rejected": -6.661063160615809,
"step": 520
},
{
"epoch": 9.422222222222222,
"grad_norm": 2.298677444458008,
"kl": 0.0,
"learning_rate": 4.358381691677932e-08,
"logits/chosen": -4109652.0,
"logits/rejected": -6422994.4,
"logps/chosen": -9.813970184326172,
"logps/rejected": -85.676953125,
"loss": 0.1562,
"rewards/chosen": 0.9679355621337891,
"rewards/margins": 7.5916393280029295,
"rewards/rejected": -6.6237037658691404,
"step": 530
},
{
"epoch": 9.6,
"grad_norm": 0.9799414873123169,
"kl": 0.38483619689941406,
"learning_rate": 1.9401983499569843e-08,
"logits/chosen": -4599832.788732395,
"logits/rejected": -5707845.0337078655,
"logps/chosen": -11.476158464458626,
"logps/rejected": -83.63685042134831,
"loss": 0.1776,
"rewards/chosen": 0.3927964492582939,
"rewards/margins": 6.871875106429994,
"rewards/rejected": -6.4790786571717,
"step": 540
},
{
"epoch": 9.777777777777779,
"grad_norm": 0.49998390674591064,
"kl": 0.0,
"learning_rate": 4.855210488670381e-09,
"logits/chosen": -4236334.577777778,
"logits/rejected": -4783881.142857143,
"logps/chosen": -5.939041476779514,
"logps/rejected": -75.12473493303571,
"loss": 0.2075,
"rewards/chosen": 1.0517437405056425,
"rewards/margins": 6.599594867040241,
"rewards/rejected": -5.547851126534598,
"step": 550
},
{
"epoch": 9.777777777777779,
"eval_logits/chosen": -5283820.8,
"eval_logits/rejected": -6485849.6,
"eval_logps/chosen": -23.082109375,
"eval_logps/rejected": -74.8675,
"eval_loss": 0.27660998702049255,
"eval_rewards/chosen": -0.7819430541992187,
"eval_rewards/margins": 4.755745544433594,
"eval_rewards/rejected": -5.537688598632813,
"eval_runtime": 20.8339,
"eval_samples_per_second": 4.8,
"eval_steps_per_second": 2.4,
"kl": 0.0,
"step": 550
},
{
"epoch": 9.955555555555556,
"grad_norm": 0.7372793555259705,
"kl": 0.0,
"learning_rate": 0.0,
"logits/chosen": -3234109.7721518986,
"logits/rejected": -6989616.197530864,
"logps/chosen": -16.822625655162184,
"logps/rejected": -82.41786024305556,
"loss": 0.2168,
"rewards/chosen": -0.01685871655427957,
"rewards/margins": 6.099000326896723,
"rewards/rejected": -6.115859043451003,
"step": 560
},
{
"epoch": 9.955555555555556,
"step": 560,
"total_flos": 5.004660760510464e+16,
"train_loss": 0.2794176772236824,
"train_runtime": 3263.9168,
"train_samples_per_second": 2.757,
"train_steps_per_second": 0.172
}
],
"logging_steps": 10,
"max_steps": 560,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.004660760510464e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}