{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 294,
"global_step": 2931,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01023541453428864,
"grad_norm": 43.49986783253336,
"kl": 2.608715057373047,
"learning_rate": 3.0612244897959183e-08,
"logits/chosen": -111473206.85714285,
"logits/rejected": -94626835.6923077,
"logps/chosen": -652.2265625,
"logps/rejected": -566.5637394831731,
"loss": 0.4991,
"rewards/chosen": -0.04828752790178571,
"rewards/margins": -0.045037640811322804,
"rewards/rejected": -0.003249887090462905,
"step": 10
},
{
"epoch": 0.02047082906857728,
"grad_norm": 45.377383375399816,
"kl": 2.1035537719726562,
"learning_rate": 6.462585034013606e-08,
"logits/chosen": -83623772.44444445,
"logits/rejected": -98599098.18181819,
"logps/chosen": -535.5240885416666,
"logps/rejected": -588.3898703835227,
"loss": 0.5002,
"rewards/chosen": -0.012379965848392911,
"rewards/margins": -0.00647647934730607,
"rewards/rejected": -0.005903486501086842,
"step": 20
},
{
"epoch": 0.030706243602865915,
"grad_norm": 47.6297142496506,
"kl": 2.7048912048339844,
"learning_rate": 9.863945578231292e-08,
"logits/chosen": -84807603.2,
"logits/rejected": -71051987.2,
"logps/chosen": -515.9376953125,
"logps/rejected": -514.024072265625,
"loss": 0.4975,
"rewards/chosen": -0.024640909830729165,
"rewards/margins": -0.002495519320170083,
"rewards/rejected": -0.022145390510559082,
"step": 30
},
{
"epoch": 0.04094165813715456,
"grad_norm": 36.8985081490464,
"kl": 5.254791259765625,
"learning_rate": 1.326530612244898e-07,
"logits/chosen": -85094592.0,
"logits/rejected": -87958234.66666667,
"logps/chosen": -537.9602748325893,
"logps/rejected": -530.6398111979166,
"loss": 0.5002,
"rewards/chosen": 0.024391608578818186,
"rewards/margins": 0.03762454007353101,
"rewards/rejected": -0.01323293149471283,
"step": 40
},
{
"epoch": 0.0511770726714432,
"grad_norm": 48.90887685320883,
"kl": 5.289024353027344,
"learning_rate": 1.6666666666666665e-07,
"logits/chosen": -78688878.54545455,
"logits/rejected": -93538062.22222222,
"logps/chosen": -549.6178089488636,
"logps/rejected": -589.7821180555555,
"loss": 0.5033,
"rewards/chosen": 0.023246071555397728,
"rewards/margins": -0.07650775198984627,
"rewards/rejected": 0.09975382354524401,
"step": 50
},
{
"epoch": 0.06141248720573183,
"grad_norm": 36.308193875487795,
"kl": 12.417892456054688,
"learning_rate": 2.0068027210884352e-07,
"logits/chosen": -83240362.66666667,
"logits/rejected": -82516282.18181819,
"logps/chosen": -506.6652018229167,
"logps/rejected": -571.6751598011364,
"loss": 0.5017,
"rewards/chosen": 0.02451240022977193,
"rewards/margins": -0.017813264420538242,
"rewards/rejected": 0.04232566465031017,
"step": 60
},
{
"epoch": 0.07164790174002048,
"grad_norm": 40.69660777082527,
"kl": 13.569828033447266,
"learning_rate": 2.346938775510204e-07,
"logits/chosen": -89204087.46666667,
"logits/rejected": -104327052.8,
"logps/chosen": -570.499609375,
"logps/rejected": -574.176953125,
"loss": 0.491,
"rewards/chosen": 0.10849486986796061,
"rewards/margins": 0.2052929679552714,
"rewards/rejected": -0.0967980980873108,
"step": 70
},
{
"epoch": 0.08188331627430911,
"grad_norm": 36.95157131931197,
"kl": 23.946407318115234,
"learning_rate": 2.6870748299319727e-07,
"logits/chosen": -109718818.13333334,
"logits/rejected": -87821907.2,
"logps/chosen": -617.5989583333334,
"logps/rejected": -588.38564453125,
"loss": 0.4978,
"rewards/chosen": 0.1503196080525716,
"rewards/margins": -0.09428855578104656,
"rewards/rejected": 0.24460816383361816,
"step": 80
},
{
"epoch": 0.09211873080859775,
"grad_norm": 46.4163800004006,
"kl": 62.34700012207031,
"learning_rate": 3.027210884353741e-07,
"logits/chosen": -103054848.0,
"logits/rejected": -76245773.71428572,
"logps/chosen": -603.8435246394231,
"logps/rejected": -516.7438616071429,
"loss": 0.4851,
"rewards/chosen": 0.3449526566725511,
"rewards/margins": -0.10476050534091152,
"rewards/rejected": 0.4497131620134626,
"step": 90
},
{
"epoch": 0.1023541453428864,
"grad_norm": 40.60547971708711,
"kl": 119.17515563964844,
"learning_rate": 3.3673469387755096e-07,
"logits/chosen": -95468101.81818181,
"logits/rejected": -94606791.1111111,
"logps/chosen": -601.3514293323864,
"logps/rejected": -620.5993381076389,
"loss": 0.4698,
"rewards/chosen": 0.7123452099886808,
"rewards/margins": 0.039795061554571576,
"rewards/rejected": 0.6725501484341092,
"step": 100
},
{
"epoch": 0.11258955987717502,
"grad_norm": 35.710378804036566,
"kl": 290.5523681640625,
"learning_rate": 3.707482993197279e-07,
"logits/chosen": -108210918.4,
"logits/rejected": -91392499.2,
"logps/chosen": -645.35595703125,
"logps/rejected": -562.3923828125,
"loss": 0.4728,
"rewards/chosen": 1.9680000305175782,
"rewards/margins": 0.7570903778076172,
"rewards/rejected": 1.210909652709961,
"step": 110
},
{
"epoch": 0.12282497441146366,
"grad_norm": 35.17890953445531,
"kl": 451.8232421875,
"learning_rate": 4.0476190476190476e-07,
"logits/chosen": -88438312.0,
"logits/rejected": -109754261.33333333,
"logps/chosen": -518.6763305664062,
"logps/rejected": -589.44970703125,
"loss": 0.466,
"rewards/chosen": 2.5440354347229004,
"rewards/margins": 0.2261904080708823,
"rewards/rejected": 2.317845026652018,
"step": 120
},
{
"epoch": 0.1330603889457523,
"grad_norm": 31.330141726965156,
"kl": 530.52392578125,
"learning_rate": 4.387755102040816e-07,
"logits/chosen": -100719965.0909091,
"logits/rejected": -100037198.22222222,
"logps/chosen": -594.1492365056819,
"logps/rejected": -589.1796875,
"loss": 0.4351,
"rewards/chosen": 3.291921788995916,
"rewards/margins": 0.4270220477171618,
"rewards/rejected": 2.8648997412787542,
"step": 130
},
{
"epoch": 0.14329580348004095,
"grad_norm": 23.86281232214143,
"kl": 658.0179443359375,
"learning_rate": 4.727891156462585e-07,
"logits/chosen": -115303606.85714285,
"logits/rejected": -109218453.33333333,
"logps/chosen": -617.2470703125,
"logps/rejected": -546.180908203125,
"loss": 0.4174,
"rewards/chosen": 3.440274919782366,
"rewards/margins": 0.4147660391671315,
"rewards/rejected": 3.0255088806152344,
"step": 140
},
{
"epoch": 0.1535312180143296,
"grad_norm": 29.55019035252469,
"kl": 849.1378784179688,
"learning_rate": 5.068027210884354e-07,
"logits/chosen": -104089250.9090909,
"logits/rejected": -112913493.33333333,
"logps/chosen": -506.69753196022725,
"logps/rejected": -624.7750651041666,
"loss": 0.4663,
"rewards/chosen": 3.6730131669477983,
"rewards/margins": -0.8608655833234686,
"rewards/rejected": 4.533878750271267,
"step": 150
},
{
"epoch": 0.16376663254861823,
"grad_norm": 24.964919672196842,
"kl": 875.5402221679688,
"learning_rate": 5.408163265306123e-07,
"logits/chosen": -109994830.76923077,
"logits/rejected": -111798601.14285715,
"logps/chosen": -581.7682542067307,
"logps/rejected": -567.9554268973214,
"loss": 0.4383,
"rewards/chosen": 5.3036322960486775,
"rewards/margins": 1.3674162141569366,
"rewards/rejected": 3.936216081891741,
"step": 160
},
{
"epoch": 0.17400204708290687,
"grad_norm": 26.396407490554235,
"kl": 900.7223510742188,
"learning_rate": 5.748299319727891e-07,
"logits/chosen": -125637876.36363636,
"logits/rejected": -111796778.66666667,
"logps/chosen": -707.3591974431819,
"logps/rejected": -564.7646484375,
"loss": 0.4225,
"rewards/chosen": 6.323025790127841,
"rewards/margins": 1.465928164395419,
"rewards/rejected": 4.857097625732422,
"step": 170
},
{
"epoch": 0.1842374616171955,
"grad_norm": 26.688093990242596,
"kl": 968.2679443359375,
"learning_rate": 6.08843537414966e-07,
"logits/chosen": -106460544.0,
"logits/rejected": -99920104.72727273,
"logps/chosen": -548.9823133680555,
"logps/rejected": -486.6614435369318,
"loss": 0.3959,
"rewards/chosen": 5.817420111762153,
"rewards/margins": 2.0305458608299793,
"rewards/rejected": 3.7868742509321733,
"step": 180
},
{
"epoch": 0.19447287615148415,
"grad_norm": 27.514737576204514,
"kl": 1100.53125,
"learning_rate": 6.428571428571429e-07,
"logits/chosen": -104961415.1111111,
"logits/rejected": -99523467.63636364,
"logps/chosen": -518.9440104166666,
"logps/rejected": -469.3260387073864,
"loss": 0.4233,
"rewards/chosen": 6.698972066243489,
"rewards/margins": 0.8934900688402578,
"rewards/rejected": 5.8054819974032315,
"step": 190
},
{
"epoch": 0.2047082906857728,
"grad_norm": 28.306342866049548,
"kl": 1226.0400390625,
"learning_rate": 6.768707482993196e-07,
"logits/chosen": -98903750.4,
"logits/rejected": -109158988.8,
"logps/chosen": -489.16083984375,
"logps/rejected": -544.397265625,
"loss": 0.3861,
"rewards/chosen": 7.531886291503906,
"rewards/margins": 1.526143264770508,
"rewards/rejected": 6.005743026733398,
"step": 200
},
{
"epoch": 0.21494370522006143,
"grad_norm": 22.693614623739084,
"kl": 1270.7314453125,
"learning_rate": 7.108843537414966e-07,
"logits/chosen": -113987630.54545455,
"logits/rejected": -114975089.77777778,
"logps/chosen": -541.0110973011364,
"logps/rejected": -538.138671875,
"loss": 0.4139,
"rewards/chosen": 7.440266002308238,
"rewards/margins": -0.12881392661971347,
"rewards/rejected": 7.569079928927952,
"step": 210
},
{
"epoch": 0.22517911975435004,
"grad_norm": 23.947122487878186,
"kl": 1415.864013671875,
"learning_rate": 7.448979591836734e-07,
"logits/chosen": -96022882.9090909,
"logits/rejected": -130673493.33333333,
"logps/chosen": -429.89302201704544,
"logps/rejected": -592.8015407986111,
"loss": 0.3978,
"rewards/chosen": 7.7172019264914775,
"rewards/margins": 2.46611855246804,
"rewards/rejected": 5.2510833740234375,
"step": 220
},
{
"epoch": 0.23541453428863868,
"grad_norm": 24.063075761950447,
"kl": 1236.5908203125,
"learning_rate": 7.789115646258503e-07,
"logits/chosen": -111414570.66666667,
"logits/rejected": -93740104.0,
"logps/chosen": -510.4249674479167,
"logps/rejected": -424.00146484375,
"loss": 0.4325,
"rewards/chosen": 7.313229878743489,
"rewards/margins": 1.4464839299519854,
"rewards/rejected": 5.866745948791504,
"step": 230
},
{
"epoch": 0.24564994882292732,
"grad_norm": 20.088674283050615,
"kl": 1163.108154296875,
"learning_rate": 8.129251700680271e-07,
"logits/chosen": -109586229.33333333,
"logits/rejected": -133773656.0,
"logps/chosen": -519.625244140625,
"logps/rejected": -560.62890625,
"loss": 0.3586,
"rewards/chosen": 6.922650655110677,
"rewards/margins": 2.392770131429036,
"rewards/rejected": 4.529880523681641,
"step": 240
},
{
"epoch": 0.25588536335721596,
"grad_norm": 25.835106855876596,
"kl": 1395.358154296875,
"learning_rate": 8.469387755102041e-07,
"logits/chosen": -112743603.2,
"logits/rejected": -99526758.4,
"logps/chosen": -546.212255859375,
"logps/rejected": -472.45087890625,
"loss": 0.3533,
"rewards/chosen": 8.94365997314453,
"rewards/margins": 3.0286102294921866,
"rewards/rejected": 5.915049743652344,
"step": 250
},
{
"epoch": 0.2661207778915046,
"grad_norm": 28.67885947302261,
"kl": 1351.502197265625,
"learning_rate": 8.809523809523809e-07,
"logits/chosen": -123957152.0,
"logits/rejected": -107065344.0,
"logps/chosen": -517.8416748046875,
"logps/rejected": -519.6959635416666,
"loss": 0.4139,
"rewards/chosen": 8.116106033325195,
"rewards/margins": 0.3912080128987627,
"rewards/rejected": 7.724898020426433,
"step": 260
},
{
"epoch": 0.27635619242579323,
"grad_norm": 23.569658518425406,
"kl": 1269.64306640625,
"learning_rate": 9.149659863945578e-07,
"logits/chosen": -133106278.4,
"logits/rejected": -113169587.2,
"logps/chosen": -585.38896484375,
"logps/rejected": -489.866357421875,
"loss": 0.3802,
"rewards/chosen": 8.878982543945312,
"rewards/margins": 1.278940582275391,
"rewards/rejected": 7.6000419616699215,
"step": 270
},
{
"epoch": 0.2865916069600819,
"grad_norm": 24.265479157836495,
"kl": 877.0639038085938,
"learning_rate": 9.489795918367347e-07,
"logits/chosen": -113836704.0,
"logits/rejected": -111877589.33333333,
"logps/chosen": -503.03131103515625,
"logps/rejected": -492.7080891927083,
"loss": 0.3226,
"rewards/chosen": 6.913208484649658,
"rewards/margins": 3.9872498512268066,
"rewards/rejected": 2.9259586334228516,
"step": 280
},
{
"epoch": 0.2968270214943705,
"grad_norm": 33.06706771664489,
"kl": 492.48126220703125,
"learning_rate": 9.829931972789116e-07,
"logits/chosen": -122592484.57142857,
"logits/rejected": -120002619.07692307,
"logps/chosen": -527.830322265625,
"logps/rejected": -702.9601862980769,
"loss": 0.3762,
"rewards/chosen": 5.435535975864956,
"rewards/margins": 7.473937066046746,
"rewards/rejected": -2.038401090181791,
"step": 290
},
{
"epoch": 0.300921187308086,
"eval_logits/chosen": -110518608.0,
"eval_logits/rejected": -102539296.0,
"eval_logps/chosen": -475.5417785644531,
"eval_logps/rejected": -533.520751953125,
"eval_loss": 0.5446844696998596,
"eval_rewards/chosen": 2.4218292236328125,
"eval_rewards/margins": -0.5885589122772217,
"eval_rewards/rejected": 3.010388135910034,
"eval_runtime": 2.6473,
"eval_samples_per_second": 3.777,
"eval_steps_per_second": 0.755,
"kl": 0.0,
"step": 294
},
{
"epoch": 0.3070624360286592,
"grad_norm": 18.659418545479838,
"kl": 6.161457061767578,
"learning_rate": 9.999911292933214e-07,
"logits/chosen": -106471222.85714285,
"logits/rejected": -111757961.84615384,
"logps/chosen": -547.7110421316964,
"logps/rejected": -634.5023287259615,
"loss": 0.447,
"rewards/chosen": 0.06852419035775321,
"rewards/margins": 2.988240658581912,
"rewards/rejected": -2.9197164682241588,
"step": 300
},
{
"epoch": 0.3172978505629478,
"grad_norm": 18.96738462683121,
"kl": 300.7326965332031,
"learning_rate": 9.999201655284278e-07,
"logits/chosen": -110882067.6923077,
"logits/rejected": -105352630.85714285,
"logps/chosen": -500.43941556490387,
"logps/rejected": -635.9861886160714,
"loss": 0.3072,
"rewards/chosen": 3.4353617154634914,
"rewards/margins": 6.811538549569937,
"rewards/rejected": -3.3761768341064453,
"step": 310
},
{
"epoch": 0.32753326509723646,
"grad_norm": 11.374422004887663,
"kl": 203.40391540527344,
"learning_rate": 9.99778248070531e-07,
"logits/chosen": -131503383.27272727,
"logits/rejected": -111262620.44444445,
"logps/chosen": -657.5584161931819,
"logps/rejected": -765.3725043402778,
"loss": 0.3116,
"rewards/chosen": 3.251959367231889,
"rewards/margins": 18.295085791385535,
"rewards/rejected": -15.043126424153646,
"step": 320
},
{
"epoch": 0.33776867963152507,
"grad_norm": 8.880038655785679,
"kl": 295.427490234375,
"learning_rate": 9.995653970619826e-07,
"logits/chosen": -106187031.27272727,
"logits/rejected": -103581155.55555555,
"logps/chosen": -590.5241477272727,
"logps/rejected": -587.0112847222222,
"loss": 0.3363,
"rewards/chosen": 3.493159207430753,
"rewards/margins": 6.228314948804451,
"rewards/rejected": -2.7351557413736978,
"step": 330
},
{
"epoch": 0.34800409416581374,
"grad_norm": 5.40509067733252,
"kl": 468.18304443359375,
"learning_rate": 9.992816427127367e-07,
"logits/chosen": -99738496.0,
"logits/rejected": -99705779.2,
"logps/chosen": -506.1044921875,
"logps/rejected": -543.95732421875,
"loss": 0.3102,
"rewards/chosen": 5.911396789550781,
"rewards/margins": 5.55134220123291,
"rewards/rejected": 0.36005458831787107,
"step": 340
},
{
"epoch": 0.35823950870010235,
"grad_norm": 12.913253729889696,
"kl": 725.096923828125,
"learning_rate": 9.989270252960613e-07,
"logits/chosen": -109946088.0,
"logits/rejected": -102954101.33333333,
"logps/chosen": -531.6331787109375,
"logps/rejected": -548.4167073567709,
"loss": 0.2884,
"rewards/chosen": 7.9849534034729,
"rewards/margins": 7.94753082593282,
"rewards/rejected": 0.03742257754007975,
"step": 350
},
{
"epoch": 0.368474923234391,
"grad_norm": 17.735001258432412,
"kl": 360.2090759277344,
"learning_rate": 9.985015951428235e-07,
"logits/chosen": -96365765.81818181,
"logits/rejected": -106907882.66666667,
"logps/chosen": -506.93399325284093,
"logps/rejected": -564.6844618055555,
"loss": 0.2752,
"rewards/chosen": 6.11878065629439,
"rewards/margins": 4.105176810062293,
"rewards/rejected": 2.013603846232096,
"step": 360
},
{
"epoch": 0.37871033776867963,
"grad_norm": 18.86741420861504,
"kl": 373.96728515625,
"learning_rate": 9.980054126343455e-07,
"logits/chosen": -112412480.0,
"logits/rejected": -97450692.26666667,
"logps/chosen": -504.6830078125,
"logps/rejected": -691.042578125,
"loss": 0.3076,
"rewards/chosen": 6.367465209960938,
"rewards/margins": 20.507525634765624,
"rewards/rejected": -14.140060424804688,
"step": 370
},
{
"epoch": 0.3889457523029683,
"grad_norm": 10.094878631245667,
"kl": 196.72659301757812,
"learning_rate": 9.97438548193834e-07,
"logits/chosen": -105118729.14285715,
"logits/rejected": -110734293.33333333,
"logps/chosen": -562.8646065848214,
"logps/rejected": -507.587646484375,
"loss": 0.2493,
"rewards/chosen": 4.5860748291015625,
"rewards/margins": 0.42298221588134766,
"rewards/rejected": 4.163092613220215,
"step": 380
},
{
"epoch": 0.3991811668372569,
"grad_norm": 14.627865457817679,
"kl": 279.072265625,
"learning_rate": 9.968010822763865e-07,
"logits/chosen": -113412138.66666667,
"logits/rejected": -133443886.54545455,
"logps/chosen": -530.6067708333334,
"logps/rejected": -751.8836115056819,
"loss": 0.3747,
"rewards/chosen": 5.105476379394531,
"rewards/margins": 12.318459944291547,
"rewards/rejected": -7.212983564897017,
"step": 390
},
{
"epoch": 0.4094165813715456,
"grad_norm": 19.572916091667736,
"kl": 460.6169738769531,
"learning_rate": 9.960931053575709e-07,
"logits/chosen": -110601969.77777778,
"logits/rejected": -108428951.27272727,
"logps/chosen": -532.2218967013889,
"logps/rejected": -531.6844815340909,
"loss": 0.35,
"rewards/chosen": 3.8675689697265625,
"rewards/margins": 5.184099370783025,
"rewards/rejected": -1.3165304010564631,
"step": 400
},
{
"epoch": 0.4196519959058342,
"grad_norm": 20.088088174480013,
"kl": 221.86827087402344,
"learning_rate": 9.953147179205854e-07,
"logits/chosen": -129331342.22222222,
"logits/rejected": -110869527.27272727,
"logps/chosen": -582.8767361111111,
"logps/rejected": -650.5072798295455,
"loss": 0.3309,
"rewards/chosen": 4.793890211317274,
"rewards/margins": 14.806629913021819,
"rewards/rejected": -10.012739701704545,
"step": 410
},
{
"epoch": 0.42988741044012285,
"grad_norm": 10.346910287143617,
"kl": 415.3928527832031,
"learning_rate": 9.94466030441996e-07,
"logits/chosen": -110768944.0,
"logits/rejected": -111097237.33333333,
"logps/chosen": -549.23974609375,
"logps/rejected": -712.3189290364584,
"loss": 0.3102,
"rewards/chosen": 6.6238861083984375,
"rewards/margins": 17.687956492106117,
"rewards/rejected": -11.064070383707682,
"step": 420
},
{
"epoch": 0.44012282497441146,
"grad_norm": 18.314103572328413,
"kl": 576.5167236328125,
"learning_rate": 9.935471633760572e-07,
"logits/chosen": -99995520.0,
"logits/rejected": -107673870.22222222,
"logps/chosen": -487.12202592329544,
"logps/rejected": -492.6575520833333,
"loss": 0.4035,
"rewards/chosen": 6.976482044566762,
"rewards/margins": 4.1658927840415885,
"rewards/rejected": 2.8105892605251737,
"step": 430
},
{
"epoch": 0.4503582395087001,
"grad_norm": 11.724627840974088,
"kl": 1106.9119873046875,
"learning_rate": 9.925582471376154e-07,
"logits/chosen": -108020854.15384616,
"logits/rejected": -109625874.28571428,
"logps/chosen": -473.66616586538464,
"logps/rejected": -555.1595284598214,
"loss": 0.3728,
"rewards/chosen": 7.7546832744891825,
"rewards/margins": 3.9347364404699303,
"rewards/rejected": 3.819946834019252,
"step": 440
},
{
"epoch": 0.46059365404298874,
"grad_norm": 15.613591939966279,
"kl": 1362.2752685546875,
"learning_rate": 9.914994220836e-07,
"logits/chosen": -117927030.15384616,
"logits/rejected": -106152228.57142857,
"logps/chosen": -495.19174429086536,
"logps/rejected": -458.72154017857144,
"loss": 0.3465,
"rewards/chosen": 9.081056448129507,
"rewards/margins": 3.9735811673677883,
"rewards/rejected": 5.107475280761719,
"step": 450
},
{
"epoch": 0.47082906857727735,
"grad_norm": 11.952485164591582,
"kl": 1285.5869140625,
"learning_rate": 9.903708384931013e-07,
"logits/chosen": -120327202.13333334,
"logits/rejected": -109135692.8,
"logps/chosen": -505.6738606770833,
"logps/rejected": -466.63486328125,
"loss": 0.385,
"rewards/chosen": 10.037539672851562,
"rewards/margins": 4.661623382568359,
"rewards/rejected": 5.375916290283203,
"step": 460
},
{
"epoch": 0.481064483111566,
"grad_norm": 0.12365071345843295,
"kl": 682.0918579101562,
"learning_rate": 9.891726565460422e-07,
"logits/chosen": -112774152.0,
"logits/rejected": -112093888.0,
"logps/chosen": -538.9781494140625,
"logps/rejected": -575.06591796875,
"loss": 0.3106,
"rewards/chosen": 8.146060943603516,
"rewards/margins": 8.388625462849935,
"rewards/rejected": -0.24256451924641928,
"step": 470
},
{
"epoch": 0.49129989764585463,
"grad_norm": 0.49092307811834135,
"kl": 450.83636474609375,
"learning_rate": 9.87905046300444e-07,
"logits/chosen": -114120941.71428572,
"logits/rejected": -114062867.6923077,
"logps/chosen": -527.6252092633929,
"logps/rejected": -601.4497445913462,
"loss": 0.2638,
"rewards/chosen": 8.766914367675781,
"rewards/margins": 13.61475078876202,
"rewards/rejected": -4.847836421086238,
"step": 480
},
{
"epoch": 0.5015353121801432,
"grad_norm": 6.1800940127591195,
"kl": 483.6259765625,
"learning_rate": 9.865681876682896e-07,
"logits/chosen": -145298368.0,
"logits/rejected": -108289821.53846154,
"logps/chosen": -661.1333705357143,
"logps/rejected": -580.3282376802885,
"loss": 0.3479,
"rewards/chosen": 6.970576695033482,
"rewards/margins": 2.642127152327652,
"rewards/rejected": 4.32844954270583,
"step": 490
},
{
"epoch": 0.5117707267144319,
"grad_norm": 7.945808526296434,
"kl": 107.43032836914062,
"learning_rate": 9.851622703899882e-07,
"logits/chosen": -121188147.2,
"logits/rejected": -111221376.0,
"logps/chosen": -597.09716796875,
"logps/rejected": -687.855078125,
"loss": 0.3365,
"rewards/chosen": 3.832713317871094,
"rewards/margins": 14.021282196044922,
"rewards/rejected": -10.188568878173829,
"step": 500
},
{
"epoch": 0.5220061412487206,
"grad_norm": 11.356989950783646,
"kl": 150.3419189453125,
"learning_rate": 9.836874940074464e-07,
"logits/chosen": -101973013.33333333,
"logits/rejected": -103617608.0,
"logps/chosen": -522.2017415364584,
"logps/rejected": -547.4072265625,
"loss": 0.3154,
"rewards/chosen": 5.501302083333333,
"rewards/margins": 3.550793608029683,
"rewards/rejected": 1.95050847530365,
"step": 510
},
{
"epoch": 0.5322415557830092,
"grad_norm": 15.04108069791104,
"kl": 167.46121215820312,
"learning_rate": 9.821440678357468e-07,
"logits/chosen": -96939017.14285715,
"logits/rejected": -111715840.0,
"logps/chosen": -514.41259765625,
"logps/rejected": -753.7920673076923,
"loss": 0.3735,
"rewards/chosen": 6.49347904750279,
"rewards/margins": 17.330997592800266,
"rewards/rejected": -10.837518545297476,
"step": 520
},
{
"epoch": 0.5424769703172978,
"grad_norm": 24.162543374076332,
"kl": 312.23114013671875,
"learning_rate": 9.8053221093344e-07,
"logits/chosen": -116499040.0,
"logits/rejected": -100585819.42857143,
"logps/chosen": -541.9063313802084,
"logps/rejected": -666.6215122767857,
"loss": 0.3326,
"rewards/chosen": 4.888221104939778,
"rewards/margins": 12.125807535080682,
"rewards/rejected": -7.237586430140904,
"step": 530
},
{
"epoch": 0.5527123848515865,
"grad_norm": 12.602052991862804,
"kl": 267.88934326171875,
"learning_rate": 9.788521520714529e-07,
"logits/chosen": -92977206.85714285,
"logits/rejected": -93523712.0,
"logps/chosen": -519.3472726004464,
"logps/rejected": -672.4755108173077,
"loss": 0.341,
"rewards/chosen": 6.53789302280971,
"rewards/margins": 20.28299847277966,
"rewards/rejected": -13.745105449969952,
"step": 540
},
{
"epoch": 0.5629477993858751,
"grad_norm": 8.979067508659034,
"kl": 238.48565673828125,
"learning_rate": 9.7710412970062e-07,
"logits/chosen": -93921649.77777778,
"logits/rejected": -94202391.27272727,
"logps/chosen": -538.5509982638889,
"logps/rejected": -825.6182528409091,
"loss": 0.3459,
"rewards/chosen": 5.394065856933594,
"rewards/margins": 30.139568675648082,
"rewards/rejected": -24.74550281871449,
"step": 550
},
{
"epoch": 0.5731832139201638,
"grad_norm": 13.93301876621068,
"kl": 451.12652587890625,
"learning_rate": 9.752883919178408e-07,
"logits/chosen": -93620721.77777778,
"logits/rejected": -119429352.72727273,
"logps/chosen": -447.1330295138889,
"logps/rejected": -851.3874289772727,
"loss": 0.3262,
"rewards/chosen": 6.1410704718695746,
"rewards/margins": 23.947242235896564,
"rewards/rejected": -17.80617176402699,
"step": 560
},
{
"epoch": 0.5834186284544524,
"grad_norm": 4.890619149638879,
"kl": 97.303955078125,
"learning_rate": 9.734051964308648e-07,
"logits/chosen": -101164160.0,
"logits/rejected": -95843181.71428572,
"logps/chosen": -528.27490234375,
"logps/rejected": -700.330078125,
"loss": 0.284,
"rewards/chosen": 7.194744990422175,
"rewards/margins": 15.150297772753369,
"rewards/rejected": -7.9555527823311945,
"step": 570
},
{
"epoch": 0.593654042988741,
"grad_norm": 11.746526426847675,
"kl": 96.44719696044922,
"learning_rate": 9.71454810521718e-07,
"logits/chosen": -100481490.28571428,
"logits/rejected": -102850837.33333333,
"logps/chosen": -567.1158272879464,
"logps/rejected": -688.10302734375,
"loss": 0.327,
"rewards/chosen": 3.4819068908691406,
"rewards/margins": 17.827465057373047,
"rewards/rejected": -14.345558166503906,
"step": 580
},
{
"epoch": 0.601842374616172,
"eval_logits/chosen": -98040168.0,
"eval_logits/rejected": -91529280.0,
"eval_logps/chosen": -425.20477294921875,
"eval_logps/rejected": -455.2176208496094,
"eval_loss": 0.39909082651138306,
"eval_rewards/chosen": 7.455529689788818,
"eval_rewards/margins": -3.385171413421631,
"eval_rewards/rejected": 10.84070110321045,
"eval_runtime": 2.934,
"eval_samples_per_second": 3.408,
"eval_steps_per_second": 0.682,
"kl": 0.0,
"step": 588
},
{
"epoch": 0.6038894575230297,
"grad_norm": 11.508747631852,
"kl": 189.71697998046875,
"learning_rate": 9.694375110087653e-07,
"logits/chosen": -105231475.2,
"logits/rejected": -98110668.8,
"logps/chosen": -517.33330078125,
"logps/rejected": -650.870703125,
"loss": 0.3037,
"rewards/chosen": 7.2950927734375,
"rewards/margins": 11.132262039184571,
"rewards/rejected": -3.8371692657470704,
"step": 590
},
{
"epoch": 0.6141248720573184,
"grad_norm": 8.003411922362675,
"kl": 246.2487030029297,
"learning_rate": 9.673535842074236e-07,
"logits/chosen": -89631707.42857143,
"logits/rejected": -93188706.46153846,
"logps/chosen": -533.9011579241071,
"logps/rejected": -591.8625300480769,
"loss": 0.3143,
"rewards/chosen": 4.597275870186942,
"rewards/margins": 8.995123852740278,
"rewards/rejected": -4.397847982553335,
"step": 600
},
{
"epoch": 0.6243602865916069,
"grad_norm": 13.51479152454202,
"kl": 251.9799041748047,
"learning_rate": 9.65203325889523e-07,
"logits/chosen": -103712000.0,
"logits/rejected": -102217077.33333333,
"logps/chosen": -554.9147251674107,
"logps/rejected": -660.7069498697916,
"loss": 0.2897,
"rewards/chosen": 3.735581534249442,
"rewards/margins": 9.694771902901785,
"rewards/rejected": -5.959190368652344,
"step": 610
},
{
"epoch": 0.6345957011258956,
"grad_norm": 6.752419583328531,
"kl": 284.58343505859375,
"learning_rate": 9.6298704124133e-07,
"logits/chosen": -112582016.0,
"logits/rejected": -102245866.66666667,
"logps/chosen": -558.93798828125,
"logps/rejected": -657.9363606770834,
"loss": 0.2812,
"rewards/chosen": 5.288269996643066,
"rewards/margins": 13.41814390818278,
"rewards/rejected": -8.129873911539713,
"step": 620
},
{
"epoch": 0.6448311156601843,
"grad_norm": 7.266588839401468,
"kl": 580.995361328125,
"learning_rate": 9.607050448202303e-07,
"logits/chosen": -92270584.8888889,
"logits/rejected": -102551435.63636364,
"logps/chosen": -440.3317599826389,
"logps/rejected": -712.0537109375,
"loss": 0.2915,
"rewards/chosen": 8.482827080620659,
"rewards/margins": 19.49976086857343,
"rewards/rejected": -11.01693378795277,
"step": 630
},
{
"epoch": 0.6550665301944729,
"grad_norm": 7.2744015718609205,
"kl": 340.7578125,
"learning_rate": 9.583576605100849e-07,
"logits/chosen": -101343896.0,
"logits/rejected": -84657386.66666667,
"logps/chosen": -561.446533203125,
"logps/rejected": -587.9672037760416,
"loss": 0.3113,
"rewards/chosen": 6.773608207702637,
"rewards/margins": 9.901070276896158,
"rewards/rejected": -3.127462069193522,
"step": 640
},
{
"epoch": 0.6653019447287615,
"grad_norm": 8.836525928002317,
"kl": 427.6199645996094,
"learning_rate": 9.559452214752618e-07,
"logits/chosen": -105170837.33333333,
"logits/rejected": -86878112.0,
"logps/chosen": -552.1053059895834,
"logps/rejected": -517.5369873046875,
"loss": 0.2568,
"rewards/chosen": 6.057671864827474,
"rewards/margins": 3.1388653119405108,
"rewards/rejected": 2.918806552886963,
"step": 650
},
{
"epoch": 0.6755373592630501,
"grad_norm": 13.148384194030669,
"kl": 565.3815307617188,
"learning_rate": 9.53468070113348e-07,
"logits/chosen": -112625361.45454545,
"logits/rejected": -100674417.77777778,
"logps/chosen": -521.5106090198864,
"logps/rejected": -619.4135199652778,
"loss": 0.3215,
"rewards/chosen": 6.586706681685015,
"rewards/margins": 7.943774637549815,
"rewards/rejected": -1.3570679558648004,
"step": 660
},
{
"epoch": 0.6857727737973388,
"grad_norm": 19.435897057149237,
"kl": 780.356689453125,
"learning_rate": 9.50926558006555e-07,
"logits/chosen": -95798784.0,
"logits/rejected": -105156501.33333333,
"logps/chosen": -470.95458984375,
"logps/rejected": -546.5327555338541,
"loss": 0.3182,
"rewards/chosen": 7.692968368530273,
"rewards/margins": 2.9158140818277998,
"rewards/rejected": 4.777154286702474,
"step": 670
},
{
"epoch": 0.6960081883316275,
"grad_norm": 6.833848958642979,
"kl": 473.15252685546875,
"learning_rate": 9.483210458718179e-07,
"logits/chosen": -118425624.0,
"logits/rejected": -96726122.66666667,
"logps/chosen": -520.1289672851562,
"logps/rejected": -606.3455403645834,
"loss": 0.3185,
"rewards/chosen": 7.276991844177246,
"rewards/margins": 8.136903285980225,
"rewards/rejected": -0.8599114418029785,
"step": 680
},
{
"epoch": 0.706243602865916,
"grad_norm": 14.808552507419774,
"kl": 146.22113037109375,
"learning_rate": 9.456519035095981e-07,
"logits/chosen": -107473191.38461539,
"logits/rejected": -101956937.14285715,
"logps/chosen": -526.8605769230769,
"logps/rejected": -668.9007393973214,
"loss": 0.2883,
"rewards/chosen": 7.2536163330078125,
"rewards/margins": 11.410586220877512,
"rewards/rejected": -4.156969887869699,
"step": 690
},
{
"epoch": 0.7164790174002047,
"grad_norm": 24.1373966304863,
"kl": 37.00183868408203,
"learning_rate": 9.429195097513992e-07,
"logits/chosen": -105830352.0,
"logits/rejected": -106361685.33333333,
"logps/chosen": -576.9627685546875,
"logps/rejected": -804.2159830729166,
"loss": 0.3142,
"rewards/chosen": 2.0549275875091553,
"rewards/margins": 19.464907089869182,
"rewards/rejected": -17.409979502360027,
"step": 700
},
{
"epoch": 0.7267144319344934,
"grad_norm": 8.881119400084598,
"kl": 153.50912475585938,
"learning_rate": 9.401242524059977e-07,
"logits/chosen": -104918784.0,
"logits/rejected": -106944930.9090909,
"logps/chosen": -621.583984375,
"logps/rejected": -713.2195490056819,
"loss": 0.2933,
"rewards/chosen": 1.9484386444091797,
"rewards/margins": 12.224429910833186,
"rewards/rejected": -10.275991266424006,
"step": 710
},
{
"epoch": 0.736949846468782,
"grad_norm": 15.249343262390562,
"kl": 181.8312530517578,
"learning_rate": 9.372665282044024e-07,
"logits/chosen": -112600905.14285715,
"logits/rejected": -99067864.61538461,
"logps/chosen": -519.7294921875,
"logps/rejected": -749.6340144230769,
"loss": 0.2071,
"rewards/chosen": 7.2293581281389505,
"rewards/margins": 27.119649782285585,
"rewards/rejected": -19.890291654146633,
"step": 720
},
{
"epoch": 0.7471852610030706,
"grad_norm": 16.243100773857645,
"kl": 254.5145263671875,
"learning_rate": 9.343467427435461e-07,
"logits/chosen": -88804677.81818181,
"logits/rejected": -105656291.55555555,
"logps/chosen": -501.12753018465907,
"logps/rejected": -622.5223524305555,
"loss": 0.2931,
"rewards/chosen": 2.286927830089222,
"rewards/margins": 9.008850810503718,
"rewards/rejected": -6.721922980414496,
"step": 730
},
{
"epoch": 0.7574206755373593,
"grad_norm": 13.717272461313408,
"kl": 251.88658142089844,
"learning_rate": 9.313653104287186e-07,
"logits/chosen": -102378560.0,
"logits/rejected": -96828368.0,
"logps/chosen": -588.1607259114584,
"logps/rejected": -709.3660888671875,
"loss": 0.3341,
"rewards/chosen": 2.007448355356852,
"rewards/margins": 10.696151892344156,
"rewards/rejected": -8.688703536987305,
"step": 740
},
{
"epoch": 0.7676560900716479,
"grad_norm": 11.54276353096192,
"kl": 315.0760192871094,
"learning_rate": 9.283226544147511e-07,
"logits/chosen": -86456040.72727273,
"logits/rejected": -96371100.44444445,
"logps/chosen": -448.26313920454544,
"logps/rejected": -643.1417100694445,
"loss": 0.3983,
"rewards/chosen": 7.560623862526634,
"rewards/margins": 11.595734336159445,
"rewards/rejected": -4.0351104736328125,
"step": 750
},
{
"epoch": 0.7778915046059366,
"grad_norm": 15.08405881286112,
"kl": 487.782470703125,
"learning_rate": 9.25219206545957e-07,
"logits/chosen": -98477909.33333333,
"logits/rejected": -93243504.0,
"logps/chosen": -528.0136311848959,
"logps/rejected": -646.53369140625,
"loss": 0.2999,
"rewards/chosen": 7.208832422892253,
"rewards/margins": 12.962552229563396,
"rewards/rejected": -5.753719806671143,
"step": 760
},
{
"epoch": 0.7881269191402251,
"grad_norm": 10.02364890514971,
"kl": 337.864013671875,
"learning_rate": 9.220554072948411e-07,
"logits/chosen": -86742869.33333333,
"logits/rejected": -99716196.57142857,
"logps/chosen": -465.5155436197917,
"logps/rejected": -689.5123465401786,
"loss": 0.3449,
"rewards/chosen": 7.739498138427734,
"rewards/margins": 17.27605492728097,
"rewards/rejected": -9.536556788853236,
"step": 770
},
{
"epoch": 0.7983623336745138,
"grad_norm": 6.84043498795176,
"kl": 173.30726623535156,
"learning_rate": 9.188317056995821e-07,
"logits/chosen": -97626282.66666667,
"logits/rejected": -92453771.63636364,
"logps/chosen": -546.7721896701389,
"logps/rejected": -680.9408735795455,
"loss": 0.2973,
"rewards/chosen": 5.306654188368055,
"rewards/margins": 12.666858711627999,
"rewards/rejected": -7.360204523259943,
"step": 780
},
{
"epoch": 0.8085977482088025,
"grad_norm": 13.851012784720913,
"kl": 245.4700927734375,
"learning_rate": 9.155485593003018e-07,
"logits/chosen": -85085882.18181819,
"logits/rejected": -89022186.66666667,
"logps/chosen": -476.38858309659093,
"logps/rejected": -872.1040581597222,
"loss": 0.2803,
"rewards/chosen": 5.146501020951704,
"rewards/margins": 33.767214996646146,
"rewards/rejected": -28.620713975694443,
"step": 790
},
{
"epoch": 0.8188331627430911,
"grad_norm": 13.213804383427497,
"kl": 134.34815979003906,
"learning_rate": 9.122064340741255e-07,
"logits/chosen": -106269207.27272727,
"logits/rejected": -83881678.22222222,
"logps/chosen": -550.1131480823864,
"logps/rejected": -748.6202256944445,
"loss": 0.2973,
"rewards/chosen": 5.797416687011719,
"rewards/margins": 30.07076432969835,
"rewards/rejected": -24.27334764268663,
"step": 800
},
{
"epoch": 0.8290685772773797,
"grad_norm": 8.051808004953356,
"kl": 60.33879089355469,
"learning_rate": 9.088058043690465e-07,
"logits/chosen": -116116224.0,
"logits/rejected": -96582818.9090909,
"logps/chosen": -650.5863715277778,
"logps/rejected": -727.5021306818181,
"loss": 0.3047,
"rewards/chosen": 4.034843444824219,
"rewards/margins": 18.15105576948686,
"rewards/rejected": -14.116212324662643,
"step": 810
},
{
"epoch": 0.8393039918116684,
"grad_norm": 4.835531934200326,
"kl": 320.6475524902344,
"learning_rate": 9.053471528366017e-07,
"logits/chosen": -123069098.66666667,
"logits/rejected": -99107821.71428572,
"logps/chosen": -613.021728515625,
"logps/rejected": -663.7785295758929,
"loss": 0.2865,
"rewards/chosen": 5.482795715332031,
"rewards/margins": 18.49122510637556,
"rewards/rejected": -13.008429391043526,
"step": 820
},
{
"epoch": 0.849539406345957,
"grad_norm": 6.763920280816717,
"kl": 119.12086486816406,
"learning_rate": 9.01830970363368e-07,
"logits/chosen": -120392115.2,
"logits/rejected": -101945299.2,
"logps/chosen": -546.945947265625,
"logps/rejected": -929.22958984375,
"loss": 0.2935,
"rewards/chosen": 5.795610046386718,
"rewards/margins": 33.776902770996095,
"rewards/rejected": -27.981292724609375,
"step": 830
},
{
"epoch": 0.8597748208802457,
"grad_norm": 16.695156760816847,
"kl": 396.1942443847656,
"learning_rate": 8.982577560012924e-07,
"logits/chosen": -92570790.4,
"logits/rejected": -96133427.2,
"logps/chosen": -500.14462890625,
"logps/rejected": -747.961962890625,
"loss": 0.3157,
"rewards/chosen": 6.134347915649414,
"rewards/margins": 16.947478103637696,
"rewards/rejected": -10.813130187988282,
"step": 840
},
{
"epoch": 0.8700102354145343,
"grad_norm": 8.721135056770576,
"kl": 161.04820251464844,
"learning_rate": 8.9462801689686e-07,
"logits/chosen": -111414489.6,
"logits/rejected": -94172275.2,
"logps/chosen": -513.11845703125,
"logps/rejected": -592.3296875,
"loss": 0.2634,
"rewards/chosen": 7.548574066162109,
"rewards/margins": 10.237901115417479,
"rewards/rejected": -2.689327049255371,
"step": 850
},
{
"epoch": 0.8802456499488229,
"grad_norm": 23.6450296971921,
"kl": 120.66998291015625,
"learning_rate": 8.909422682191157e-07,
"logits/chosen": -108347946.66666667,
"logits/rejected": -96643232.0,
"logps/chosen": -650.9365234375,
"logps/rejected": -971.8366088867188,
"loss": 0.3753,
"rewards/chosen": 1.5685276985168457,
"rewards/margins": 38.76629304885864,
"rewards/rejected": -37.1977653503418,
"step": 860
},
{
"epoch": 0.8904810644831116,
"grad_norm": 10.184595791078122,
"kl": 55.834754943847656,
"learning_rate": 8.872010330865454e-07,
"logits/chosen": -118076885.33333333,
"logits/rejected": -120020640.0,
"logps/chosen": -600.4386800130209,
"logps/rejected": -1235.656494140625,
"loss": 0.3306,
"rewards/chosen": 3.8476082483927407,
"rewards/margins": 56.71857992808024,
"rewards/rejected": -52.8709716796875,
"step": 870
},
{
"epoch": 0.9007164790174002,
"grad_norm": 15.000452794185478,
"kl": 166.8241424560547,
"learning_rate": 8.834048424928304e-07,
"logits/chosen": -84017722.18181819,
"logits/rejected": -92456405.33333333,
"logps/chosen": -500.98464133522725,
"logps/rejected": -590.1988389756945,
"loss": 0.3939,
"rewards/chosen": 3.811018857088956,
"rewards/margins": 5.707060091423266,
"rewards/rejected": -1.8960412343343098,
"step": 880
},
{
"epoch": 0.9027635619242579,
"eval_logits/chosen": -88125560.0,
"eval_logits/rejected": -82095856.0,
"eval_logps/chosen": -429.337646484375,
"eval_logps/rejected": -468.053466796875,
"eval_loss": 0.20316681265830994,
"eval_rewards/chosen": 7.042242527008057,
"eval_rewards/margins": -2.51487398147583,
"eval_rewards/rejected": 9.557116508483887,
"eval_runtime": 2.6369,
"eval_samples_per_second": 3.792,
"eval_steps_per_second": 0.758,
"kl": 0.0,
"step": 882
},
{
"epoch": 0.9109518935516888,
"grad_norm": 13.183651143904488,
"kl": 134.21334838867188,
"learning_rate": 8.795542352314834e-07,
"logits/chosen": -94884726.15384616,
"logits/rejected": -93538349.71428572,
"logps/chosen": -486.8059645432692,
"logps/rejected": -642.7673688616071,
"loss": 0.31,
"rewards/chosen": 7.5120063194861775,
"rewards/margins": 10.829019651308164,
"rewards/rejected": -3.3170133318219865,
"step": 890
},
{
"epoch": 0.9211873080859775,
"grad_norm": 12.212132136986225,
"kl": 207.9207763671875,
"learning_rate": 8.756497578193771e-07,
"logits/chosen": -83985936.0,
"logits/rejected": -85384976.0,
"logps/chosen": -493.8150329589844,
"logps/rejected": -741.118408203125,
"loss": 0.3121,
"rewards/chosen": 3.521029472351074,
"rewards/margins": 21.653578758239746,
"rewards/rejected": -18.132549285888672,
"step": 900
},
{
"epoch": 0.9314227226202662,
"grad_norm": 15.869594800463931,
"kl": 121.78730773925781,
"learning_rate": 8.716919644191773e-07,
"logits/chosen": -95207923.2,
"logits/rejected": -86504256.0,
"logps/chosen": -502.17255859375,
"logps/rejected": -570.52373046875,
"loss": 0.3842,
"rewards/chosen": 4.867946624755859,
"rewards/margins": 8.839313507080078,
"rewards/rejected": -3.9713668823242188,
"step": 910
},
{
"epoch": 0.9416581371545547,
"grad_norm": 11.625516820508704,
"kl": 26.68120574951172,
"learning_rate": 8.676814167606905e-07,
"logits/chosen": -94190675.2,
"logits/rejected": -74575884.8,
"logps/chosen": -573.2962890625,
"logps/rejected": -981.627734375,
"loss": 0.2716,
"rewards/chosen": 3.0543357849121096,
"rewards/margins": 48.8309928894043,
"rewards/rejected": -45.77665710449219,
"step": 920
},
{
"epoch": 0.9518935516888434,
"grad_norm": 11.337702446083828,
"kl": 94.01974487304688,
"learning_rate": 8.636186840611379e-07,
"logits/chosen": -75281489.45454545,
"logits/rejected": -80548579.55555555,
"logps/chosen": -506.4801580255682,
"logps/rejected": -948.1449652777778,
"loss": 0.3049,
"rewards/chosen": 3.306485262784091,
"rewards/margins": 38.26877231790562,
"rewards/rejected": -34.96228705512153,
"step": 930
},
{
"epoch": 0.962128966223132,
"grad_norm": 9.210148269471487,
"kl": 10.692268371582031,
"learning_rate": 8.595043429443657e-07,
"logits/chosen": -81648140.8,
"logits/rejected": -68831852.8,
"logps/chosen": -525.10361328125,
"logps/rejected": -681.321337890625,
"loss": 0.3209,
"rewards/chosen": 4.949716567993164,
"rewards/margins": 18.163093948364256,
"rewards/rejected": -13.213377380371094,
"step": 940
},
{
"epoch": 0.9723643807574207,
"grad_norm": 15.353110855422312,
"kl": 127.87919616699219,
"learning_rate": 8.553389773590054e-07,
"logits/chosen": -79860096.0,
"logits/rejected": -79998992.0,
"logps/chosen": -505.05023193359375,
"logps/rejected": -634.8770751953125,
"loss": 0.2912,
"rewards/chosen": 3.4712483882904053,
"rewards/margins": 12.555377721786499,
"rewards/rejected": -9.084129333496094,
"step": 950
},
{
"epoch": 0.9825997952917093,
"grad_norm": 13.869616858822004,
"kl": 238.24578857421875,
"learning_rate": 8.511231784955937e-07,
"logits/chosen": -88956788.36363636,
"logits/rejected": -103397703.1111111,
"logps/chosen": -548.9154829545455,
"logps/rejected": -788.0484483506945,
"loss": 0.3187,
"rewards/chosen": 4.906893643465909,
"rewards/margins": 20.047172777580492,
"rewards/rejected": -15.140279134114584,
"step": 960
},
{
"epoch": 0.9928352098259979,
"grad_norm": 7.268319539608931,
"kl": 338.30377197265625,
"learning_rate": 8.468575447026651e-07,
"logits/chosen": -101875840.0,
"logits/rejected": -87527862.85714285,
"logps/chosen": -622.6335261418269,
"logps/rejected": -595.9942801339286,
"loss": 0.2935,
"rewards/chosen": 6.616310706505408,
"rewards/margins": 11.137901348072093,
"rewards/rejected": -4.521590641566685,
"step": 970
},
{
"epoch": 1.0030706243602865,
"grad_norm": 11.76521234747505,
"kl": 199.67869567871094,
"learning_rate": 8.425426814018276e-07,
"logits/chosen": -89589788.44444445,
"logits/rejected": -80169029.81818181,
"logps/chosen": -493.0221896701389,
"logps/rejected": -716.8766424005681,
"loss": 0.3217,
"rewards/chosen": 7.745334201388889,
"rewards/margins": 21.7819885870423,
"rewards/rejected": -14.036654385653408,
"step": 980
},
{
"epoch": 1.0133060388945752,
"grad_norm": 8.474284377746724,
"kl": 244.9769744873047,
"learning_rate": 8.381792010018361e-07,
"logits/chosen": -94540608.0,
"logits/rejected": -89885610.66666667,
"logps/chosen": -535.6998845880681,
"logps/rejected": -803.9326714409722,
"loss": 0.2194,
"rewards/chosen": 5.931960365988991,
"rewards/margins": 25.050717825841424,
"rewards/rejected": -19.118757459852432,
"step": 990
},
{
"epoch": 1.0235414534288638,
"grad_norm": 12.815547513288609,
"kl": 179.80715942382812,
"learning_rate": 8.33767722811672e-07,
"logits/chosen": -89090510.22222222,
"logits/rejected": -94376587.63636364,
"logps/chosen": -540.0843641493055,
"logps/rejected": -727.6770241477273,
"loss": 0.2687,
"rewards/chosen": 4.669386969672309,
"rewards/margins": 16.463514000478416,
"rewards/rejected": -11.794127030806107,
"step": 1000
},
{
"epoch": 1.0337768679631525,
"grad_norm": 6.109931782123331,
"kl": 160.75302124023438,
"learning_rate": 8.293088729526465e-07,
"logits/chosen": -82976885.33333333,
"logits/rejected": -86193800.0,
"logps/chosen": -501.4635823567708,
"logps/rejected": -1245.8173828125,
"loss": 0.2226,
"rewards/chosen": 5.3093001047770185,
"rewards/margins": 75.92017046610515,
"rewards/rejected": -70.61087036132812,
"step": 1010
},
{
"epoch": 1.0440122824974412,
"grad_norm": 15.654981917234721,
"kl": 116.26859283447266,
"learning_rate": 8.24803284269533e-07,
"logits/chosen": -81231587.55555555,
"logits/rejected": -93158120.72727273,
"logps/chosen": -519.5434027777778,
"logps/rejected": -1056.223544034091,
"loss": 0.2267,
"rewards/chosen": 4.975582546657986,
"rewards/margins": 50.14659195716935,
"rewards/rejected": -45.17100941051137,
"step": 1020
},
{
"epoch": 1.0542476970317298,
"grad_norm": 6.75360490081935,
"kl": 89.63571166992188,
"learning_rate": 8.202515962407484e-07,
"logits/chosen": -82426867.2,
"logits/rejected": -97263526.4,
"logps/chosen": -495.3529296875,
"logps/rejected": -915.62001953125,
"loss": 0.2569,
"rewards/chosen": 4.025004577636719,
"rewards/margins": 30.261656188964842,
"rewards/rejected": -26.236651611328124,
"step": 1030
},
{
"epoch": 1.0644831115660185,
"grad_norm": 8.435457326936826,
"kl": 65.1689682006836,
"learning_rate": 8.156544548875929e-07,
"logits/chosen": -94793408.0,
"logits/rejected": -82803456.0,
"logps/chosen": -615.216552734375,
"logps/rejected": -890.8118489583334,
"loss": 0.2582,
"rewards/chosen": 4.597088813781738,
"rewards/margins": 39.764737129211426,
"rewards/rejected": -35.16764831542969,
"step": 1040
},
{
"epoch": 1.0747185261003072,
"grad_norm": 10.185636259451332,
"kl": 41.50421142578125,
"learning_rate": 8.110125126825586e-07,
"logits/chosen": -94445606.4,
"logits/rejected": -81221324.8,
"logps/chosen": -530.09130859375,
"logps/rejected": -720.1796875,
"loss": 0.1988,
"rewards/chosen": 2.9237091064453127,
"rewards/margins": 18.941264851888022,
"rewards/rejected": -16.01755574544271,
"step": 1050
},
{
"epoch": 1.0849539406345956,
"grad_norm": 14.367610693803071,
"kl": 40.293880462646484,
"learning_rate": 8.063264284567244e-07,
"logits/chosen": -88462400.0,
"logits/rejected": -84809104.0,
"logps/chosen": -556.0542805989584,
"logps/rejected": -790.8518676757812,
"loss": 0.2299,
"rewards/chosen": 2.747189521789551,
"rewards/margins": 25.718636512756348,
"rewards/rejected": -22.971446990966797,
"step": 1060
},
{
"epoch": 1.0951893551688843,
"grad_norm": 7.867680484103394,
"kl": 84.09693908691406,
"learning_rate": 8.015968673062485e-07,
"logits/chosen": -97079057.45454545,
"logits/rejected": -113056576.0,
"logps/chosen": -652.1316583806819,
"logps/rejected": -871.9325086805555,
"loss": 0.2491,
"rewards/chosen": 3.3613832647150215,
"rewards/margins": 30.61995531332613,
"rewards/rejected": -27.25857204861111,
"step": 1070
},
{
"epoch": 1.105424769703173,
"grad_norm": 18.500289346264474,
"kl": 178.9644317626953,
"learning_rate": 7.968245004979715e-07,
"logits/chosen": -88232459.63636364,
"logits/rejected": -97237361.77777778,
"logps/chosen": -494.20854048295456,
"logps/rejected": -620.2445203993055,
"loss": 0.2429,
"rewards/chosen": 4.08733506636186,
"rewards/margins": 12.414174089528093,
"rewards/rejected": -8.326839023166233,
"step": 1080
},
{
"epoch": 1.1156601842374616,
"grad_norm": 12.84885943054635,
"kl": 122.49982452392578,
"learning_rate": 7.920100053741426e-07,
"logits/chosen": -93443669.33333333,
"logits/rejected": -100065931.63636364,
"logps/chosen": -518.4554036458334,
"logps/rejected": -726.6989524147727,
"loss": 0.2324,
"rewards/chosen": 2.174501207139757,
"rewards/margins": 17.892823961046005,
"rewards/rejected": -15.71832275390625,
"step": 1090
},
{
"epoch": 1.1258955987717503,
"grad_norm": 19.387059089946195,
"kl": 27.290096282958984,
"learning_rate": 7.87154065256285e-07,
"logits/chosen": -92360000.0,
"logits/rejected": -116687132.44444445,
"logps/chosen": -506.4671519886364,
"logps/rejected": -787.19140625,
"loss": 0.2213,
"rewards/chosen": 4.497406352650035,
"rewards/margins": 22.27773743446427,
"rewards/rejected": -17.780331081814236,
"step": 1100
},
{
"epoch": 1.136131013306039,
"grad_norm": 13.175723781566907,
"kl": 73.90142822265625,
"learning_rate": 7.822573693482119e-07,
"logits/chosen": -97808337.45454545,
"logits/rejected": -93331783.1111111,
"logps/chosen": -509.45339133522725,
"logps/rejected": -572.677734375,
"loss": 0.2823,
"rewards/chosen": 4.765384674072266,
"rewards/margins": 7.263773176405165,
"rewards/rejected": -2.498388502332899,
"step": 1110
},
{
"epoch": 1.1463664278403276,
"grad_norm": 13.791443161319787,
"kl": 29.938060760498047,
"learning_rate": 7.773206126382077e-07,
"logits/chosen": -101143435.63636364,
"logits/rejected": -114077838.22222222,
"logps/chosen": -584.0867365056819,
"logps/rejected": -804.4205729166666,
"loss": 0.2454,
"rewards/chosen": 3.9651097384366123,
"rewards/margins": 18.79806329746439,
"rewards/rejected": -14.832953559027779,
"step": 1120
},
{
"epoch": 1.156601842374616,
"grad_norm": 8.034234943256347,
"kl": 116.21412658691406,
"learning_rate": 7.723444958003882e-07,
"logits/chosen": -116664576.0,
"logits/rejected": -97974736.0,
"logps/chosen": -587.37646484375,
"logps/rejected": -664.1363525390625,
"loss": 0.2012,
"rewards/chosen": 5.379718780517578,
"rewards/margins": 13.820740699768066,
"rewards/rejected": -8.441021919250488,
"step": 1130
},
{
"epoch": 1.1668372569089047,
"grad_norm": 22.76694207719624,
"kl": 9.00653076171875,
"learning_rate": 7.673297250952547e-07,
"logits/chosen": -100386534.4,
"logits/rejected": -109167040.0,
"logps/chosen": -535.891455078125,
"logps/rejected": -875.89404296875,
"loss": 0.2078,
"rewards/chosen": 3.430181121826172,
"rewards/margins": 24.129712677001955,
"rewards/rejected": -20.69953155517578,
"step": 1140
},
{
"epoch": 1.1770726714431934,
"grad_norm": 12.60561797855466,
"kl": 11.566696166992188,
"learning_rate": 7.622770122694525e-07,
"logits/chosen": -127401927.1111111,
"logits/rejected": -114918923.63636364,
"logps/chosen": -666.6869574652778,
"logps/rejected": -823.8343394886364,
"loss": 0.2521,
"rewards/chosen": 2.1344752841525607,
"rewards/margins": 23.581607741538924,
"rewards/rejected": -21.447132457386363,
"step": 1150
},
{
"epoch": 1.187308085977482,
"grad_norm": 13.853587264944288,
"kl": 42.4306755065918,
"learning_rate": 7.571870744547551e-07,
"logits/chosen": -109880817.77777778,
"logits/rejected": -106957009.45454545,
"logps/chosen": -593.1612413194445,
"logps/rejected": -905.1737393465909,
"loss": 0.2524,
"rewards/chosen": 2.1737113528781467,
"rewards/margins": 30.0016698933611,
"rewards/rejected": -27.827958540482953,
"step": 1160
},
{
"epoch": 1.1975435005117707,
"grad_norm": 15.543850269979604,
"kl": 110.62580108642578,
"learning_rate": 7.520606340662798e-07,
"logits/chosen": -96752089.6,
"logits/rejected": -93654444.8,
"logps/chosen": -498.780810546875,
"logps/rejected": -706.256103515625,
"loss": 0.2188,
"rewards/chosen": 5.138518905639648,
"rewards/margins": 18.503916549682614,
"rewards/rejected": -13.365397644042968,
"step": 1170
},
{
"epoch": 1.203684749232344,
"eval_logits/chosen": -105911152.0,
"eval_logits/rejected": -95925920.0,
"eval_logps/chosen": -484.4088134765625,
"eval_logps/rejected": -496.7947998046875,
"eval_loss": 0.39017611742019653,
"eval_rewards/chosen": 1.535125732421875,
"eval_rewards/margins": -5.147857666015625,
"eval_rewards/rejected": 6.6829833984375,
"eval_runtime": 2.6237,
"eval_samples_per_second": 3.811,
"eval_steps_per_second": 0.762,
"kl": 0.0,
"step": 1176
},
{
"epoch": 1.2077789150460594,
"grad_norm": 10.926577914323278,
"kl": 134.444091796875,
"learning_rate": 7.468984186999565e-07,
"logits/chosen": -93668560.0,
"logits/rejected": -102340970.66666667,
"logps/chosen": -475.44390869140625,
"logps/rejected": -876.3225911458334,
"loss": 0.2985,
"rewards/chosen": 1.7625150680541992,
"rewards/margins": 29.492424329121906,
"rewards/rejected": -27.729909261067707,
"step": 1180
},
{
"epoch": 1.218014329580348,
"grad_norm": 17.09063024418297,
"kl": 101.04781341552734,
"learning_rate": 7.417011610292584e-07,
"logits/chosen": -129169689.6,
"logits/rejected": -107744477.86666666,
"logps/chosen": -637.96298828125,
"logps/rejected": -853.0078776041667,
"loss": 0.1699,
"rewards/chosen": 1.2093938827514648,
"rewards/margins": 29.540536053975423,
"rewards/rejected": -28.33114217122396,
"step": 1190
},
{
"epoch": 1.2282497441146367,
"grad_norm": 10.4739549796137,
"kl": 80.91563415527344,
"learning_rate": 7.364695987012156e-07,
"logits/chosen": -100979484.44444445,
"logits/rejected": -102875240.72727273,
"logps/chosen": -516.0002712673611,
"logps/rejected": -780.2686434659091,
"loss": 0.2704,
"rewards/chosen": 2.6132854885525174,
"rewards/margins": 26.03951617924854,
"rewards/rejected": -23.426230690696023,
"step": 1200
},
{
"epoch": 1.2384851586489254,
"grad_norm": 15.645731487799601,
"kl": 55.83991622924805,
"learning_rate": 7.312044742317196e-07,
"logits/chosen": -97166316.8,
"logits/rejected": -98332864.0,
"logps/chosen": -473.275732421875,
"logps/rejected": -872.339453125,
"loss": 0.2458,
"rewards/chosen": 5.7388427734375,
"rewards/margins": 40.456884765625,
"rewards/rejected": -34.7180419921875,
"step": 1210
},
{
"epoch": 1.2487205731832138,
"grad_norm": 16.544369267846214,
"kl": 26.200450897216797,
"learning_rate": 7.259065349001381e-07,
"logits/chosen": -112301428.36363636,
"logits/rejected": -91687751.1111111,
"logps/chosen": -603.98046875,
"logps/rejected": -697.6657986111111,
"loss": 0.2189,
"rewards/chosen": 4.487701416015625,
"rewards/margins": 19.04924519856771,
"rewards/rejected": -14.561543782552084,
"step": 1220
},
{
"epoch": 1.2589559877175025,
"grad_norm": 12.68572044575919,
"kl": 198.68328857421875,
"learning_rate": 7.205765326432538e-07,
"logits/chosen": -104480836.92307693,
"logits/rejected": -114156653.71428572,
"logps/chosen": -595.9436598557693,
"logps/rejected": -1002.3537946428571,
"loss": 0.296,
"rewards/chosen": 5.6437542255108175,
"rewards/margins": 30.03105716914921,
"rewards/rejected": -24.387302943638392,
"step": 1230
},
{
"epoch": 1.2691914022517912,
"grad_norm": 14.327102372681413,
"kl": 131.150390625,
"learning_rate": 7.152152239485418e-07,
"logits/chosen": -92621104.0,
"logits/rejected": -94707562.66666667,
"logps/chosen": -522.0101318359375,
"logps/rejected": -597.5576171875,
"loss": 0.2498,
"rewards/chosen": 4.805232524871826,
"rewards/margins": 9.411616484324139,
"rewards/rejected": -4.6063839594523115,
"step": 1240
},
{
"epoch": 1.2794268167860798,
"grad_norm": 13.696519258504198,
"kl": 120.63490295410156,
"learning_rate": 7.098233697468019e-07,
"logits/chosen": -91027916.8,
"logits/rejected": -120194508.8,
"logps/chosen": -560.667919921875,
"logps/rejected": -722.74677734375,
"loss": 0.2297,
"rewards/chosen": 4.605225372314453,
"rewards/margins": 18.276374053955077,
"rewards/rejected": -13.671148681640625,
"step": 1250
},
{
"epoch": 1.2896622313203685,
"grad_norm": 19.84542195273479,
"kl": 238.61802673339844,
"learning_rate": 7.044017353041585e-07,
"logits/chosen": -123477056.0,
"logits/rejected": -79627398.4,
"logps/chosen": -695.36806640625,
"logps/rejected": -575.853515625,
"loss": 0.2209,
"rewards/chosen": -6.707785034179688,
"rewards/margins": 1.1494865417480469,
"rewards/rejected": -7.857271575927735,
"step": 1260
},
{
"epoch": 1.2998976458546572,
"grad_norm": 17.108004492145188,
"kl": 22.916343688964844,
"learning_rate": 6.989510901134477e-07,
"logits/chosen": -104629414.4,
"logits/rejected": -99776633.6,
"logps/chosen": -606.61474609375,
"logps/rejected": -840.0265625,
"loss": 0.2224,
"rewards/chosen": 4.991733551025391,
"rewards/margins": 33.285465240478516,
"rewards/rejected": -28.293731689453125,
"step": 1270
},
{
"epoch": 1.3101330603889458,
"grad_norm": 10.36059136074515,
"kl": 116.60319519042969,
"learning_rate": 6.934722077850016e-07,
"logits/chosen": -100436208.0,
"logits/rejected": -115627882.66666667,
"logps/chosen": -515.864501953125,
"logps/rejected": -724.6964518229166,
"loss": 0.2796,
"rewards/chosen": 4.3812665939331055,
"rewards/margins": 12.6353546778361,
"rewards/rejected": -8.254088083902994,
"step": 1280
},
{
"epoch": 1.3203684749232343,
"grad_norm": 15.48356653719983,
"kl": 45.89117431640625,
"learning_rate": 6.879658659368514e-07,
"logits/chosen": -91997213.0909091,
"logits/rejected": -89794709.33333333,
"logps/chosen": -473.80996981534093,
"logps/rejected": -531.3462456597222,
"loss": 0.2376,
"rewards/chosen": 4.85120287808505,
"rewards/margins": 6.749144891295771,
"rewards/rejected": -1.8979420132107205,
"step": 1290
},
{
"epoch": 1.330603889457523,
"grad_norm": 12.868008283397653,
"kl": 18.55701446533203,
"learning_rate": 6.82432846084359e-07,
"logits/chosen": -100941725.53846154,
"logits/rejected": -99222765.71428572,
"logps/chosen": -621.4768254206731,
"logps/rejected": -828.6552734375,
"loss": 0.2153,
"rewards/chosen": 1.7661338219275842,
"rewards/margins": 25.749048337831603,
"rewards/rejected": -23.982914515904017,
"step": 1300
},
{
"epoch": 1.3408393039918116,
"grad_norm": 2.2800239418596466,
"kl": 89.49375915527344,
"learning_rate": 6.768739335292968e-07,
"logits/chosen": -121310284.8,
"logits/rejected": -93642886.4,
"logps/chosen": -556.091015625,
"logps/rejected": -601.05625,
"loss": 0.1707,
"rewards/chosen": 4.573309326171875,
"rewards/margins": 14.642904663085938,
"rewards/rejected": -10.069595336914062,
"step": 1310
},
{
"epoch": 1.3510747185261003,
"grad_norm": 15.366769443574192,
"kl": 293.1350402832031,
"learning_rate": 6.712899172483892e-07,
"logits/chosen": -91668277.33333333,
"logits/rejected": -110515696.0,
"logps/chosen": -507.600341796875,
"logps/rejected": -879.2127685546875,
"loss": 0.2528,
"rewards/chosen": 5.978892644246419,
"rewards/margins": 22.380460103352863,
"rewards/rejected": -16.401567459106445,
"step": 1320
},
{
"epoch": 1.361310133060389,
"grad_norm": 12.823756571517078,
"kl": 88.99737548828125,
"learning_rate": 6.656815897813345e-07,
"logits/chosen": -87199751.1111111,
"logits/rejected": -95049268.36363636,
"logps/chosen": -489.8304036458333,
"logps/rejected": -755.6834161931819,
"loss": 0.1953,
"rewards/chosen": 5.404658423529731,
"rewards/margins": 20.248757988515525,
"rewards/rejected": -14.844099564985795,
"step": 1330
},
{
"epoch": 1.3715455475946776,
"grad_norm": 4.283693497606417,
"kl": 163.53549194335938,
"learning_rate": 6.600497471183179e-07,
"logits/chosen": -101366438.4,
"logits/rejected": -88560857.6,
"logps/chosen": -491.186083984375,
"logps/rejected": -717.26845703125,
"loss": 0.208,
"rewards/chosen": 6.666817474365234,
"rewards/margins": 24.17246780395508,
"rewards/rejected": -17.505650329589844,
"step": 1340
},
{
"epoch": 1.3817809621289663,
"grad_norm": 7.001179043778539,
"kl": 66.92013549804688,
"learning_rate": 6.543951885870382e-07,
"logits/chosen": -95844366.22222222,
"logits/rejected": -96672808.72727273,
"logps/chosen": -510.3117404513889,
"logps/rejected": -747.4904119318181,
"loss": 0.1907,
"rewards/chosen": 6.773768530951606,
"rewards/margins": 23.975265002009845,
"rewards/rejected": -17.20149647105824,
"step": 1350
},
{
"epoch": 1.3920163766632547,
"grad_norm": 14.848116150242822,
"kl": 87.44967651367188,
"learning_rate": 6.48718716739258e-07,
"logits/chosen": -107372951.27272727,
"logits/rejected": -94606008.8888889,
"logps/chosen": -541.4000355113636,
"logps/rejected": -566.193359375,
"loss": 0.2408,
"rewards/chosen": 3.51375337080522,
"rewards/margins": 9.902944892343848,
"rewards/rejected": -6.389191521538629,
"step": 1360
},
{
"epoch": 1.4022517911975436,
"grad_norm": 11.300486504343981,
"kl": 102.50377655029297,
"learning_rate": 6.430211372368983e-07,
"logits/chosen": -104613869.71428572,
"logits/rejected": -78341808.0,
"logps/chosen": -588.1082589285714,
"logps/rejected": -561.199462890625,
"loss": 0.2488,
"rewards/chosen": 3.8442611694335938,
"rewards/margins": 6.867713610331217,
"rewards/rejected": -3.0234524408976235,
"step": 1370
},
{
"epoch": 1.412487205731832,
"grad_norm": 7.278523044606318,
"kl": 47.79290008544922,
"learning_rate": 6.373032587376903e-07,
"logits/chosen": -98703773.53846154,
"logits/rejected": -88402276.57142857,
"logps/chosen": -518.7367412860577,
"logps/rejected": -640.8177315848214,
"loss": 0.2596,
"rewards/chosen": 3.898407275860126,
"rewards/margins": 13.543180486658118,
"rewards/rejected": -9.644773210797991,
"step": 1380
},
{
"epoch": 1.4227226202661207,
"grad_norm": 14.528128586654807,
"kl": 20.99447250366211,
"learning_rate": 6.31565892780403e-07,
"logits/chosen": -113441461.33333333,
"logits/rejected": -117415408.0,
"logps/chosen": -673.6360677083334,
"logps/rejected": -767.6175537109375,
"loss": 0.2178,
"rewards/chosen": 3.181301752726237,
"rewards/margins": 22.52920405069987,
"rewards/rejected": -19.347902297973633,
"step": 1390
},
{
"epoch": 1.4329580348004094,
"grad_norm": 13.997484831173727,
"kl": 23.745975494384766,
"learning_rate": 6.258098536696608e-07,
"logits/chosen": -94636196.57142857,
"logits/rejected": -112435381.33333333,
"logps/chosen": -549.872802734375,
"logps/rejected": -694.0374348958334,
"loss": 0.2152,
"rewards/chosen": 1.8581578390938895,
"rewards/margins": 16.185761497134255,
"rewards/rejected": -14.327603658040365,
"step": 1400
},
{
"epoch": 1.443193449334698,
"grad_norm": 7.521678155084116,
"kl": 91.66047668457031,
"learning_rate": 6.200359583603702e-07,
"logits/chosen": -91731765.33333333,
"logits/rejected": -93070952.0,
"logps/chosen": -454.1505940755208,
"logps/rejected": -862.4594116210938,
"loss": 0.2282,
"rewards/chosen": 4.978697141011556,
"rewards/margins": 37.34273370107015,
"rewards/rejected": -32.364036560058594,
"step": 1410
},
{
"epoch": 1.4534288638689867,
"grad_norm": 9.62091749430687,
"kl": 155.88140869140625,
"learning_rate": 6.142450263417684e-07,
"logits/chosen": -102225635.55555555,
"logits/rejected": -85442594.9090909,
"logps/chosen": -540.4261067708334,
"logps/rejected": -603.1096857244319,
"loss": 0.2332,
"rewards/chosen": 3.9674284193250866,
"rewards/margins": 13.474742927936592,
"rewards/rejected": -9.507314508611506,
"step": 1420
},
{
"epoch": 1.4636642784032754,
"grad_norm": 10.495215345293603,
"kl": 126.53300476074219,
"learning_rate": 6.084378795211142e-07,
"logits/chosen": -100052528.0,
"logits/rejected": -94666272.0,
"logps/chosen": -581.60009765625,
"logps/rejected": -689.3089192708334,
"loss": 0.2222,
"rewards/chosen": 2.338420867919922,
"rewards/margins": 16.052043914794922,
"rewards/rejected": -13.713623046875,
"step": 1430
},
{
"epoch": 1.473899692937564,
"grad_norm": 3.841823620667789,
"kl": 66.59071350097656,
"learning_rate": 6.026153421070332e-07,
"logits/chosen": -100694074.18181819,
"logits/rejected": -116525688.8888889,
"logps/chosen": -546.9574751420455,
"logps/rejected": -961.9046223958334,
"loss": 0.195,
"rewards/chosen": 4.302017905495384,
"rewards/margins": 37.5053429651742,
"rewards/rejected": -33.20332505967882,
"step": 1440
},
{
"epoch": 1.4841351074718525,
"grad_norm": 14.904727186940962,
"kl": 87.73085021972656,
"learning_rate": 5.967782404925392e-07,
"logits/chosen": -98680000.0,
"logits/rejected": -112079114.66666667,
"logps/chosen": -520.26025390625,
"logps/rejected": -843.9371744791666,
"loss": 0.2511,
"rewards/chosen": 3.4578888416290283,
"rewards/margins": 27.866142829259235,
"rewards/rejected": -24.408253987630207,
"step": 1450
},
{
"epoch": 1.4943705220061412,
"grad_norm": 13.06698843977878,
"kl": 151.13763427734375,
"learning_rate": 5.909274031377433e-07,
"logits/chosen": -95404913.77777778,
"logits/rejected": -88787351.27272727,
"logps/chosen": -568.8107096354166,
"logps/rejected": -707.0969460227273,
"loss": 0.2509,
"rewards/chosen": 3.700217776828342,
"rewards/margins": 16.992627500283596,
"rewards/rejected": -13.292409723455256,
"step": 1460
},
{
"epoch": 1.5046059365404298,
"grad_norm": 11.196107724803628,
"kl": 162.71450805664062,
"learning_rate": 5.850636604522717e-07,
"logits/chosen": -103681984.0,
"logits/rejected": -115164441.6,
"logps/chosen": -570.78681640625,
"logps/rejected": -804.131298828125,
"loss": 0.2339,
"rewards/chosen": 5.796274948120117,
"rewards/margins": 18.870234298706055,
"rewards/rejected": -13.073959350585938,
"step": 1470
},
{
"epoch": 1.5046059365404298,
"eval_logits/chosen": -98059872.0,
"eval_logits/rejected": -89680456.0,
"eval_logps/chosen": -486.9290771484375,
"eval_logps/rejected": -479.80731201171875,
"eval_loss": 0.35584360361099243,
"eval_rewards/chosen": 1.2830994129180908,
"eval_rewards/margins": -7.098632574081421,
"eval_rewards/rejected": 8.381731986999512,
"eval_runtime": 2.6347,
"eval_samples_per_second": 3.795,
"eval_steps_per_second": 0.759,
"kl": 0.0,
"step": 1470
},
{
"epoch": 1.5148413510747185,
"grad_norm": 16.203618211283683,
"kl": 92.46534729003906,
"learning_rate": 5.791878446774034e-07,
"logits/chosen": -103741664.0,
"logits/rejected": -82316051.2,
"logps/chosen": -515.66162109375,
"logps/rejected": -602.708349609375,
"loss": 0.2773,
"rewards/chosen": 4.238919830322265,
"rewards/margins": 13.160222625732422,
"rewards/rejected": -8.921302795410156,
"step": 1480
},
{
"epoch": 1.5250767656090072,
"grad_norm": 4.554873934008656,
"kl": 167.45574951171875,
"learning_rate": 5.733007897679528e-07,
"logits/chosen": -95108238.22222222,
"logits/rejected": -91586996.36363636,
"logps/chosen": -521.0322265625,
"logps/rejected": -721.2854225852273,
"loss": 0.1973,
"rewards/chosen": 4.9189779493543835,
"rewards/margins": 20.43809859921234,
"rewards/rejected": -15.519120649857955,
"step": 1490
},
{
"epoch": 1.5353121801432958,
"grad_norm": 7.6574893969363504,
"kl": 45.464637756347656,
"learning_rate": 5.674033312739047e-07,
"logits/chosen": -136661788.44444445,
"logits/rejected": -121526865.45454545,
"logps/chosen": -624.0289713541666,
"logps/rejected": -737.0136274857955,
"loss": 0.2013,
"rewards/chosen": 4.872588263617621,
"rewards/margins": 18.956630899448587,
"rewards/rejected": -14.084042635830967,
"step": 1500
},
{
"epoch": 1.5455475946775845,
"grad_norm": 8.371922803052616,
"kl": 122.37657165527344,
"learning_rate": 5.614963062218252e-07,
"logits/chosen": -112632277.33333333,
"logits/rejected": -97041344.0,
"logps/chosen": -567.5799967447916,
"logps/rejected": -660.11474609375,
"loss": 0.1998,
"rewards/chosen": 6.09868049621582,
"rewards/margins": 15.676589965820312,
"rewards/rejected": -9.577909469604492,
"step": 1510
},
{
"epoch": 1.555783009211873,
"grad_norm": 9.555101175645209,
"kl": 84.63465881347656,
"learning_rate": 5.555805529960626e-07,
"logits/chosen": -107756818.28571428,
"logits/rejected": -104269922.46153846,
"logps/chosen": -569.1482631138393,
"logps/rejected": -676.1059194711538,
"loss": 0.276,
"rewards/chosen": 5.877526419503348,
"rewards/margins": 21.615523621276186,
"rewards/rejected": -15.737997201772837,
"step": 1520
},
{
"epoch": 1.5660184237461618,
"grad_norm": 8.285346789381096,
"kl": 17.301956176757812,
"learning_rate": 5.496569112197548e-07,
"logits/chosen": -108150368.0,
"logits/rejected": -117954704.0,
"logps/chosen": -564.1471354166666,
"logps/rejected": -842.083740234375,
"loss": 0.2172,
"rewards/chosen": 6.367968241373698,
"rewards/margins": 25.966650644938152,
"rewards/rejected": -19.598682403564453,
"step": 1530
},
{
"epoch": 1.5762538382804503,
"grad_norm": 17.201078473501788,
"kl": 29.990673065185547,
"learning_rate": 5.437262216356628e-07,
"logits/chosen": -99166184.72727273,
"logits/rejected": -96667192.8888889,
"logps/chosen": -560.8868963068181,
"logps/rejected": -690.4375,
"loss": 0.2375,
"rewards/chosen": 2.5369420485063032,
"rewards/margins": 14.643823084205087,
"rewards/rejected": -12.106881035698784,
"step": 1540
},
{
"epoch": 1.586489252814739,
"grad_norm": 14.598976158627305,
"kl": 17.094257354736328,
"learning_rate": 5.377893259868427e-07,
"logits/chosen": -93399792.0,
"logits/rejected": -101074688.0,
"logps/chosen": -506.314208984375,
"logps/rejected": -705.162109375,
"loss": 0.1902,
"rewards/chosen": 3.808037519454956,
"rewards/margins": 19.942469994227093,
"rewards/rejected": -16.134432474772137,
"step": 1550
},
{
"epoch": 1.5967246673490276,
"grad_norm": 11.677222021035318,
"kl": 105.08064270019531,
"learning_rate": 5.318470668971779e-07,
"logits/chosen": -113351078.4,
"logits/rejected": -97037670.4,
"logps/chosen": -570.3861328125,
"logps/rejected": -638.00615234375,
"loss": 0.2409,
"rewards/chosen": 3.3090076446533203,
"rewards/margins": 12.0073673248291,
"rewards/rejected": -8.69835968017578,
"step": 1560
},
{
"epoch": 1.6069600818833163,
"grad_norm": 16.040864873166022,
"kl": 193.0989227294922,
"learning_rate": 5.259002877517853e-07,
"logits/chosen": -90070867.2,
"logits/rejected": -99008230.4,
"logps/chosen": -465.715625,
"logps/rejected": -680.6184895833334,
"loss": 0.2715,
"rewards/chosen": 4.33167724609375,
"rewards/margins": 15.996807861328126,
"rewards/rejected": -11.665130615234375,
"step": 1570
},
{
"epoch": 1.617195496417605,
"grad_norm": 24.30022370778095,
"kl": 89.72409057617188,
"learning_rate": 5.199498325773134e-07,
"logits/chosen": -104927464.72727273,
"logits/rejected": -88201393.77777778,
"logps/chosen": -567.3364701704545,
"logps/rejected": -562.6102430555555,
"loss": 0.2075,
"rewards/chosen": 4.681469310413707,
"rewards/margins": 8.131336558948863,
"rewards/rejected": -3.4498672485351562,
"step": 1580
},
{
"epoch": 1.6274309109518934,
"grad_norm": 10.016797523044985,
"kl": 32.81374740600586,
"learning_rate": 5.139965459221495e-07,
"logits/chosen": -107691754.66666667,
"logits/rejected": -97935697.45454545,
"logps/chosen": -572.6688368055555,
"logps/rejected": -638.0316051136364,
"loss": 0.2441,
"rewards/chosen": 5.759796990288629,
"rewards/margins": 18.01395570388948,
"rewards/rejected": -12.254158713600852,
"step": 1590
},
{
"epoch": 1.6376663254861823,
"grad_norm": 16.580461098226703,
"kl": 68.80343627929688,
"learning_rate": 5.080412727365535e-07,
"logits/chosen": -124645034.66666667,
"logits/rejected": -97441800.0,
"logps/chosen": -639.878173828125,
"logps/rejected": -744.6334228515625,
"loss": 0.1817,
"rewards/chosen": 5.480181376139323,
"rewards/margins": 26.614111582438152,
"rewards/rejected": -21.133930206298828,
"step": 1600
},
{
"epoch": 1.6479017400204707,
"grad_norm": 13.42103249531687,
"kl": 56.13240432739258,
"learning_rate": 5.020848582527335e-07,
"logits/chosen": -103806021.81818181,
"logits/rejected": -112090880.0,
"logps/chosen": -548.7734375,
"logps/rejected": -757.8369140625,
"loss": 0.2598,
"rewards/chosen": 2.9247055053710938,
"rewards/margins": 20.906888326009113,
"rewards/rejected": -17.98218282063802,
"step": 1610
},
{
"epoch": 1.6581371545547596,
"grad_norm": 12.634298219691603,
"kl": 141.90322875976562,
"learning_rate": 4.96128147864882e-07,
"logits/chosen": -89943796.36363636,
"logits/rejected": -102573880.8888889,
"logps/chosen": -538.3907137784091,
"logps/rejected": -753.2516818576389,
"loss": 0.1773,
"rewards/chosen": 5.100575186989524,
"rewards/margins": 21.259171649663134,
"rewards/rejected": -16.15859646267361,
"step": 1620
},
{
"epoch": 1.668372569089048,
"grad_norm": 10.28197062624758,
"kl": 107.90435791015625,
"learning_rate": 4.90171987009189e-07,
"logits/chosen": -106620205.71428572,
"logits/rejected": -113546080.0,
"logps/chosen": -552.3481096540179,
"logps/rejected": -906.138427734375,
"loss": 0.1918,
"rewards/chosen": 4.133395603724888,
"rewards/margins": 27.026680719284784,
"rewards/rejected": -22.893285115559895,
"step": 1630
},
{
"epoch": 1.6786079836233367,
"grad_norm": 17.96411547258075,
"kl": 112.21823120117188,
"learning_rate": 4.8421722104385e-07,
"logits/chosen": -75008458.66666667,
"logits/rejected": -94131072.0,
"logps/chosen": -432.4429117838542,
"logps/rejected": -667.931640625,
"loss": 0.2239,
"rewards/chosen": 3.447942097981771,
"rewards/margins": 15.51882571265811,
"rewards/rejected": -12.070883614676339,
"step": 1640
},
{
"epoch": 1.6888433981576254,
"grad_norm": 10.395637391051638,
"kl": 23.069644927978516,
"learning_rate": 4.78264695129083e-07,
"logits/chosen": -116295520.0,
"logits/rejected": -107159306.66666667,
"logps/chosen": -560.1647338867188,
"logps/rejected": -712.3726399739584,
"loss": 0.2036,
"rewards/chosen": 3.4754228591918945,
"rewards/margins": 13.128763516743978,
"rewards/rejected": -9.653340657552084,
"step": 1650
},
{
"epoch": 1.699078812691914,
"grad_norm": 8.156359676723904,
"kl": 80.28675842285156,
"learning_rate": 4.723152541071761e-07,
"logits/chosen": -128034397.0909091,
"logits/rejected": -115200327.1111111,
"logps/chosen": -657.2834250710227,
"logps/rejected": -728.0647786458334,
"loss": 0.2249,
"rewards/chosen": 4.118690490722656,
"rewards/margins": 21.072269863552517,
"rewards/rejected": -16.95357937282986,
"step": 1660
},
{
"epoch": 1.7093142272262027,
"grad_norm": 11.037173163466019,
"kl": 70.9041748046875,
"learning_rate": 4.663697423825777e-07,
"logits/chosen": -111218501.81818181,
"logits/rejected": -110178133.33333333,
"logps/chosen": -608.4298650568181,
"logps/rejected": -630.6374240451389,
"loss": 0.1887,
"rewards/chosen": 1.0514965057373047,
"rewards/margins": 6.370984183417426,
"rewards/rejected": -5.319487677680121,
"step": 1670
},
{
"epoch": 1.7195496417604912,
"grad_norm": 10.163513417285465,
"kl": 90.06620788574219,
"learning_rate": 4.604290038020513e-07,
"logits/chosen": -104523801.6,
"logits/rejected": -105088793.6,
"logps/chosen": -601.39541015625,
"logps/rejected": -695.315625,
"loss": 0.2451,
"rewards/chosen": 3.491130065917969,
"rewards/margins": 16.996690368652345,
"rewards/rejected": -13.505560302734375,
"step": 1680
},
{
"epoch": 1.72978505629478,
"grad_norm": 8.51268769415796,
"kl": 78.53779602050781,
"learning_rate": 4.5449388153490786e-07,
"logits/chosen": -113766317.71428572,
"logits/rejected": -97993846.15384616,
"logps/chosen": -695.9150390625,
"logps/rejected": -615.8410832331731,
"loss": 0.2208,
"rewards/chosen": -1.227851186479841,
"rewards/margins": 6.885328533885243,
"rewards/rejected": -8.113179720365084,
"step": 1690
},
{
"epoch": 1.7400204708290685,
"grad_norm": 16.94149194280564,
"kl": 84.0811767578125,
"learning_rate": 4.485652179533347e-07,
"logits/chosen": -94786432.0,
"logits/rejected": -94003632.0,
"logps/chosen": -496.9593912760417,
"logps/rejected": -667.4026489257812,
"loss": 0.2518,
"rewards/chosen": 4.43801212310791,
"rewards/margins": 15.846646308898926,
"rewards/rejected": -11.408634185791016,
"step": 1700
},
{
"epoch": 1.7502558853633572,
"grad_norm": 11.672933785919794,
"kl": 37.66474533081055,
"learning_rate": 4.426438545128372e-07,
"logits/chosen": -112977585.77777778,
"logits/rejected": -100742376.72727273,
"logps/chosen": -576.189453125,
"logps/rejected": -709.9443359375,
"loss": 0.2097,
"rewards/chosen": 3.0994716220431857,
"rewards/margins": 16.225449571705827,
"rewards/rejected": -13.125977949662643,
"step": 1710
},
{
"epoch": 1.7604912998976459,
"grad_norm": 0.8936706403580842,
"kl": 46.1988525390625,
"learning_rate": 4.367306316328121e-07,
"logits/chosen": -108418517.33333333,
"logits/rejected": -93911760.0,
"logps/chosen": -573.855224609375,
"logps/rejected": -722.8265380859375,
"loss": 0.1714,
"rewards/chosen": 4.775790532430013,
"rewards/margins": 19.311710675557453,
"rewards/rejected": -14.535920143127441,
"step": 1720
},
{
"epoch": 1.7707267144319345,
"grad_norm": 8.657146619452396,
"kl": 213.92623901367188,
"learning_rate": 4.30826388577265e-07,
"logits/chosen": -104575015.38461539,
"logits/rejected": -98068845.71428572,
"logps/chosen": -534.2808743990385,
"logps/rejected": -600.8302176339286,
"loss": 0.1977,
"rewards/chosen": 5.302577678973858,
"rewards/margins": 9.943926381540823,
"rewards/rejected": -4.641348702566964,
"step": 1730
},
{
"epoch": 1.7809621289662232,
"grad_norm": 10.789363563455533,
"kl": 131.159423828125,
"learning_rate": 4.2493196333569584e-07,
"logits/chosen": -97828420.92307693,
"logits/rejected": -83620754.28571428,
"logps/chosen": -487.26893028846155,
"logps/rejected": -598.9041573660714,
"loss": 0.2351,
"rewards/chosen": 5.452203603891226,
"rewards/margins": 12.849205142849094,
"rewards/rejected": -7.397001538957868,
"step": 1740
},
{
"epoch": 1.7911975435005116,
"grad_norm": 12.092749709288178,
"kl": 166.184814453125,
"learning_rate": 4.190481925041606e-07,
"logits/chosen": -105054982.4,
"logits/rejected": -86679507.2,
"logps/chosen": -531.07646484375,
"logps/rejected": -626.576318359375,
"loss": 0.2203,
"rewards/chosen": 6.711921691894531,
"rewards/margins": 16.232135009765624,
"rewards/rejected": -9.520213317871093,
"step": 1750
},
{
"epoch": 1.8014329580348005,
"grad_norm": 10.247946491106909,
"kl": 195.1206512451172,
"learning_rate": 4.131759111665348e-07,
"logits/chosen": -104919637.33333333,
"logits/rejected": -98690360.0,
"logps/chosen": -521.9753824869791,
"logps/rejected": -756.753173828125,
"loss": 0.2021,
"rewards/chosen": 5.4654890696207685,
"rewards/margins": 21.19386164347331,
"rewards/rejected": -15.728372573852539,
"step": 1760
},
{
"epoch": 1.805527123848516,
"eval_logits/chosen": -95303792.0,
"eval_logits/rejected": -87230504.0,
"eval_logps/chosen": -472.82220458984375,
"eval_logps/rejected": -488.859130859375,
"eval_loss": 0.2824169099330902,
"eval_rewards/chosen": 2.69378662109375,
"eval_rewards/margins": -4.782763957977295,
"eval_rewards/rejected": 7.476550579071045,
"eval_runtime": 2.6295,
"eval_samples_per_second": 3.803,
"eval_steps_per_second": 0.761,
"kl": 0.0,
"step": 1764
},
{
"epoch": 1.811668372569089,
"grad_norm": 8.176862722781726,
"kl": 152.9454345703125,
"learning_rate": 4.0731595277598986e-07,
"logits/chosen": -95640564.36363636,
"logits/rejected": -105284501.33333333,
"logps/chosen": -486.55069247159093,
"logps/rejected": -683.3498263888889,
"loss": 0.2283,
"rewards/chosen": 6.601251775568182,
"rewards/margins": 14.320289997139362,
"rewards/rejected": -7.71903822157118,
"step": 1770
},
{
"epoch": 1.8219037871033776,
"grad_norm": 12.366163049603129,
"kl": 134.93138122558594,
"learning_rate": 4.0146914903669997e-07,
"logits/chosen": -105663914.66666667,
"logits/rejected": -86276560.0,
"logps/chosen": -558.2360026041666,
"logps/rejected": -618.48046875,
"loss": 0.2848,
"rewards/chosen": 6.694384256998698,
"rewards/margins": 12.43185583750407,
"rewards/rejected": -5.737471580505371,
"step": 1780
},
{
"epoch": 1.8321392016376663,
"grad_norm": 21.06542323848456,
"kl": 114.84019470214844,
"learning_rate": 3.9563632978579997e-07,
"logits/chosen": -130333696.0,
"logits/rejected": -118268970.66666667,
"logps/chosen": -678.7927024147727,
"logps/rejected": -766.8365885416666,
"loss": 0.1941,
"rewards/chosen": -1.294362328269265,
"rewards/margins": 13.073830556387852,
"rewards/rejected": -14.368192884657118,
"step": 1790
},
{
"epoch": 1.842374616171955,
"grad_norm": 7.508684819898513,
"kl": 144.90069580078125,
"learning_rate": 3.898183228756049e-07,
"logits/chosen": -97050137.6,
"logits/rejected": -90996108.8,
"logps/chosen": -500.40439453125,
"logps/rejected": -676.8037109375,
"loss": 0.2811,
"rewards/chosen": 5.051666259765625,
"rewards/margins": 15.36988525390625,
"rewards/rejected": -10.318218994140626,
"step": 1800
},
{
"epoch": 1.8526100307062436,
"grad_norm": 12.437592526611242,
"kl": 114.08500671386719,
"learning_rate": 3.840159540561134e-07,
"logits/chosen": -105590414.22222222,
"logits/rejected": -89462213.81818181,
"logps/chosen": -513.1369357638889,
"logps/rejected": -584.1775568181819,
"loss": 0.2467,
"rewards/chosen": 3.517538070678711,
"rewards/margins": 8.527439637617633,
"rewards/rejected": -5.009901566938921,
"step": 1810
},
{
"epoch": 1.862845445240532,
"grad_norm": 15.091845848621832,
"kl": 47.479251861572266,
"learning_rate": 3.782300468578103e-07,
"logits/chosen": -108857937.45454545,
"logits/rejected": -103325866.66666667,
"logps/chosen": -555.7039240056819,
"logps/rejected": -626.8645833333334,
"loss": 0.2541,
"rewards/chosen": 3.7626831748268823,
"rewards/margins": 11.195115561437126,
"rewards/rejected": -7.432432386610243,
"step": 1820
},
{
"epoch": 1.873080859774821,
"grad_norm": 19.183770345291794,
"kl": 41.419944763183594,
"learning_rate": 3.7246142247478035e-07,
"logits/chosen": -101665499.42857143,
"logits/rejected": -109967360.0,
"logps/chosen": -559.6848842075893,
"logps/rejected": -773.8717447916666,
"loss": 0.2448,
"rewards/chosen": 1.5084868839808874,
"rewards/margins": 14.50175648643857,
"rewards/rejected": -12.993269602457682,
"step": 1830
},
{
"epoch": 1.8833162743091094,
"grad_norm": 9.069536625547933,
"kl": 45.53041076660156,
"learning_rate": 3.6671089964815825e-07,
"logits/chosen": -108608816.0,
"logits/rejected": -89606421.33333333,
"logps/chosen": -513.4603271484375,
"logps/rejected": -684.734375,
"loss": 0.265,
"rewards/chosen": 4.741620063781738,
"rewards/margins": 17.57877826690674,
"rewards/rejected": -12.837158203125,
"step": 1840
},
{
"epoch": 1.8935516888433983,
"grad_norm": 19.468688850390492,
"kl": 90.07597351074219,
"learning_rate": 3.6097929454992404e-07,
"logits/chosen": -106665146.18181819,
"logits/rejected": -86712704.0,
"logps/chosen": -654.1851917613636,
"logps/rejected": -702.7716471354166,
"loss": 0.1897,
"rewards/chosen": 4.631488106467507,
"rewards/margins": 19.50170362838591,
"rewards/rejected": -14.870215521918404,
"step": 1850
},
{
"epoch": 1.9037871033776868,
"grad_norm": 10.66508500138114,
"kl": 153.10336303710938,
"learning_rate": 3.5526742066706316e-07,
"logits/chosen": -99318946.9090909,
"logits/rejected": -107160832.0,
"logps/chosen": -510.36629971590907,
"logps/rejected": -641.9811197916666,
"loss": 0.2336,
"rewards/chosen": 5.530141657049006,
"rewards/margins": 12.735254538179648,
"rewards/rejected": -7.205112881130642,
"step": 1860
},
{
"epoch": 1.9140225179119754,
"grad_norm": 8.059119234031746,
"kl": 44.486595153808594,
"learning_rate": 3.4957608868610927e-07,
"logits/chosen": -114195315.2,
"logits/rejected": -108387635.2,
"logps/chosen": -591.734326171875,
"logps/rejected": -637.55859375,
"loss": 0.2205,
"rewards/chosen": 1.337228012084961,
"rewards/margins": 5.68485221862793,
"rewards/rejected": -4.347624206542969,
"step": 1870
},
{
"epoch": 1.924257932446264,
"grad_norm": 10.00472400660884,
"kl": 281.92333984375,
"learning_rate": 3.439061063780826e-07,
"logits/chosen": -91237228.3076923,
"logits/rejected": -98920045.71428572,
"logps/chosen": -496.14002403846155,
"logps/rejected": -737.2635323660714,
"loss": 0.2072,
"rewards/chosen": 5.549809382512019,
"rewards/margins": 21.350414066524294,
"rewards/rejected": -15.800604684012276,
"step": 1880
},
{
"epoch": 1.9344933469805528,
"grad_norm": 10.522188816412612,
"kl": 322.6961669921875,
"learning_rate": 3.382582784838428e-07,
"logits/chosen": -121360153.6,
"logits/rejected": -107659980.8,
"logps/chosen": -605.14111328125,
"logps/rejected": -678.30458984375,
"loss": 0.2686,
"rewards/chosen": 4.314989471435547,
"rewards/margins": 10.155291748046874,
"rewards/rejected": -5.840302276611328,
"step": 1890
},
{
"epoch": 1.9447287615148414,
"grad_norm": 12.824401757150811,
"kl": 382.4858093261719,
"learning_rate": 3.326334065998737e-07,
"logits/chosen": -101046491.42857143,
"logits/rejected": -115594496.0,
"logps/chosen": -531.6218610491071,
"logps/rejected": -693.0041316105769,
"loss": 0.1756,
"rewards/chosen": 6.1677044459751675,
"rewards/margins": 14.67305441217108,
"rewards/rejected": -8.505349966195913,
"step": 1900
},
{
"epoch": 1.9549641760491299,
"grad_norm": 20.386712706894404,
"kl": 99.92483520507812,
"learning_rate": 3.270322890645105e-07,
"logits/chosen": -101339978.66666667,
"logits/rejected": -103863136.0,
"logps/chosen": -533.7759602864584,
"logps/rejected": -736.4990234375,
"loss": 0.2402,
"rewards/chosen": 5.52944819132487,
"rewards/margins": 17.632124582926433,
"rewards/rejected": -12.102676391601562,
"step": 1910
},
{
"epoch": 1.9651995905834188,
"grad_norm": 5.038959800127083,
"kl": 48.24281311035156,
"learning_rate": 3.214557208446327e-07,
"logits/chosen": -100319953.45454545,
"logits/rejected": -92343381.33333333,
"logps/chosen": -546.0839399857955,
"logps/rejected": -560.3200412326389,
"loss": 0.2344,
"rewards/chosen": 4.807866876775568,
"rewards/margins": 3.9413422863892835,
"rewards/rejected": 0.8665245903862847,
"step": 1920
},
{
"epoch": 1.9754350051177072,
"grad_norm": 6.680434736938146,
"kl": 124.0519790649414,
"learning_rate": 3.159044934228348e-07,
"logits/chosen": -105253174.85714285,
"logits/rejected": -104486037.33333333,
"logps/chosen": -535.1736886160714,
"logps/rejected": -598.3387044270834,
"loss": 0.2006,
"rewards/chosen": 5.359432765415737,
"rewards/margins": 8.449786413283576,
"rewards/rejected": -3.0903536478678384,
"step": 1930
},
{
"epoch": 1.9856704196519959,
"grad_norm": 11.230039906447315,
"kl": 122.17435455322266,
"learning_rate": 3.1037939468509e-07,
"logits/chosen": -97059254.85714285,
"logits/rejected": -96700928.0,
"logps/chosen": -553.3533761160714,
"logps/rejected": -575.9772761418269,
"loss": 0.2392,
"rewards/chosen": 5.7837949480329245,
"rewards/margins": 5.070555718390496,
"rewards/rejected": 0.7132392296424279,
"step": 1940
},
{
"epoch": 1.9959058341862845,
"grad_norm": 7.4274799670453575,
"kl": 186.48150634765625,
"learning_rate": 3.0488120880892663e-07,
"logits/chosen": -96918192.0,
"logits/rejected": -95595184.0,
"logps/chosen": -471.45294189453125,
"logps/rejected": -576.8538818359375,
"loss": 0.2082,
"rewards/chosen": 5.839811325073242,
"rewards/margins": 13.376245975494385,
"rewards/rejected": -7.536434650421143,
"step": 1950
},
{
"epoch": 2.006141248720573,
"grad_norm": 10.584557120336866,
"kl": 125.15768432617188,
"learning_rate": 2.9941071615212903e-07,
"logits/chosen": -109870899.2,
"logits/rejected": -80665926.4,
"logps/chosen": -596.23798828125,
"logps/rejected": -555.70048828125,
"loss": 0.1868,
"rewards/chosen": 6.416107940673828,
"rewards/margins": 7.968547344207764,
"rewards/rejected": -1.5524394035339355,
"step": 1960
},
{
"epoch": 2.016376663254862,
"grad_norm": 8.808290014986497,
"kl": 202.176513671875,
"learning_rate": 2.9396869314198125e-07,
"logits/chosen": -94040345.6,
"logits/rejected": -91740160.0,
"logps/chosen": -494.99248046875,
"logps/rejected": -640.023974609375,
"loss": 0.1113,
"rewards/chosen": 6.723219299316407,
"rewards/margins": 12.369775009155273,
"rewards/rejected": -5.646555709838867,
"step": 1970
},
{
"epoch": 2.0266120777891503,
"grad_norm": 7.577887182750462,
"kl": 10.694358825683594,
"learning_rate": 2.8855591216507e-07,
"logits/chosen": -112530150.4,
"logits/rejected": -101618681.6,
"logps/chosen": -533.929443359375,
"logps/rejected": -578.36171875,
"loss": 0.1269,
"rewards/chosen": 5.4773094177246096,
"rewards/margins": 14.136001586914062,
"rewards/rejected": -8.658692169189454,
"step": 1980
},
{
"epoch": 2.036847492323439,
"grad_norm": 14.922868038999017,
"kl": 75.09333801269531,
"learning_rate": 2.831731414576576e-07,
"logits/chosen": -107291744.0,
"logits/rejected": -97015648.0,
"logps/chosen": -555.2694091796875,
"logps/rejected": -593.9874674479166,
"loss": 0.1077,
"rewards/chosen": 4.765722274780273,
"rewards/margins": 11.604981740315754,
"rewards/rejected": -6.8392594655354815,
"step": 1990
},
{
"epoch": 2.0470829068577276,
"grad_norm": 4.235861378584866,
"kl": 50.54994583129883,
"learning_rate": 2.7782114499664846e-07,
"logits/chosen": -92629866.66666667,
"logits/rejected": -105082845.0909091,
"logps/chosen": -467.57025824652777,
"logps/rejected": -619.3458806818181,
"loss": 0.1028,
"rewards/chosen": 5.671641031901042,
"rewards/margins": 12.007132559111625,
"rewards/rejected": -6.335491527210582,
"step": 2000
},
{
"epoch": 2.0573183213920165,
"grad_norm": 5.227781817757105,
"kl": 108.27552795410156,
"learning_rate": 2.725006823911562e-07,
"logits/chosen": -107582277.81818181,
"logits/rejected": -94774528.0,
"logps/chosen": -517.8961292613636,
"logps/rejected": -567.1627604166666,
"loss": 0.1834,
"rewards/chosen": 6.7398598410866475,
"rewards/margins": 13.737200149382003,
"rewards/rejected": -6.997340308295356,
"step": 2010
},
{
"epoch": 2.067553735926305,
"grad_norm": 10.398496742073478,
"kl": 116.98248291015625,
"learning_rate": 2.6721250877469243e-07,
"logits/chosen": -134676784.0,
"logits/rejected": -104658389.33333333,
"logps/chosen": -541.9503173828125,
"logps/rejected": -725.39892578125,
"loss": 0.1362,
"rewards/chosen": 6.3118109703063965,
"rewards/margins": 20.48145945866903,
"rewards/rejected": -14.16964848836263,
"step": 2020
},
{
"epoch": 2.0777891504605934,
"grad_norm": 5.107311910263906,
"kl": 44.497169494628906,
"learning_rate": 2.6195737469799194e-07,
"logits/chosen": -116298069.33333333,
"logits/rejected": -106568624.0,
"logps/chosen": -578.7025960286459,
"logps/rejected": -693.4928588867188,
"loss": 0.1087,
"rewards/chosen": 5.359598795572917,
"rewards/margins": 11.766344706217449,
"rewards/rejected": -6.406745910644531,
"step": 2030
},
{
"epoch": 2.0880245649948823,
"grad_norm": 10.206921524937323,
"kl": 67.50515747070312,
"learning_rate": 2.567360260224855e-07,
"logits/chosen": -113804869.81818181,
"logits/rejected": -130537272.8888889,
"logps/chosen": -587.5666725852273,
"logps/rejected": -769.2184787326389,
"loss": 0.1701,
"rewards/chosen": 3.0162519975142046,
"rewards/margins": 18.496425744258996,
"rewards/rejected": -15.480173746744791,
"step": 2040
},
{
"epoch": 2.0982599795291708,
"grad_norm": 11.147368052934937,
"kl": 40.51292037963867,
"learning_rate": 2.5154920381444025e-07,
"logits/chosen": -132331107.55555555,
"logits/rejected": -110937320.72727273,
"logps/chosen": -654.6213650173611,
"logps/rejected": -708.1742720170455,
"loss": 0.1052,
"rewards/chosen": 5.017095353868273,
"rewards/margins": 18.23353156658134,
"rewards/rejected": -13.216436212713068,
"step": 2050
},
{
"epoch": 2.1064483111566017,
"eval_logits/chosen": -104192184.0,
"eval_logits/rejected": -93659376.0,
"eval_logps/chosen": -475.7626953125,
"eval_logps/rejected": -484.2992858886719,
"eval_loss": 0.30316784977912903,
"eval_rewards/chosen": 2.399737596511841,
"eval_rewards/margins": -5.53279709815979,
"eval_rewards/rejected": 7.932534694671631,
"eval_runtime": 2.6298,
"eval_samples_per_second": 3.803,
"eval_steps_per_second": 0.761,
"kl": 0.0,
"step": 2058
},
{
"epoch": 2.1084953940634596,
"grad_norm": 12.754723605905042,
"kl": 127.5950927734375,
"learning_rate": 2.463976442397802e-07,
"logits/chosen": -112657896.72727273,
"logits/rejected": -103738140.44444445,
"logps/chosen": -540.5254794034091,
"logps/rejected": -752.6629774305555,
"loss": 0.1219,
"rewards/chosen": 6.787003950639204,
"rewards/margins": 20.9433997568458,
"rewards/rejected": -14.156395806206596,
"step": 2060
},
{
"epoch": 2.118730808597748,
"grad_norm": 11.192314383278982,
"kl": 104.99942016601562,
"learning_rate": 2.4128207845960206e-07,
"logits/chosen": -100784049.23076923,
"logits/rejected": -123069549.71428572,
"logps/chosen": -486.64881310096155,
"logps/rejected": -850.0779854910714,
"loss": 0.1127,
"rewards/chosen": 6.771697411170373,
"rewards/margins": 32.08223556686234,
"rewards/rejected": -25.310538155691965,
"step": 2070
},
{
"epoch": 2.128966223132037,
"grad_norm": 4.699379149769008,
"kl": 65.97349548339844,
"learning_rate": 2.3620323252640205e-07,
"logits/chosen": -107415344.0,
"logits/rejected": -99578250.66666667,
"logps/chosen": -487.5479431152344,
"logps/rejected": -701.464599609375,
"loss": 0.1361,
"rewards/chosen": 5.433587551116943,
"rewards/margins": 23.49019765853882,
"rewards/rejected": -18.056610107421875,
"step": 2080
},
{
"epoch": 2.1392016376663254,
"grad_norm": 6.473056947414698,
"kl": 103.8187255859375,
"learning_rate": 2.3116182728102634e-07,
"logits/chosen": -111746880.0,
"logits/rejected": -113652608.0,
"logps/chosen": -527.4331665039062,
"logps/rejected": -751.0619303385416,
"loss": 0.1384,
"rewards/chosen": 6.121653079986572,
"rewards/margins": 20.22429895401001,
"rewards/rejected": -14.102645874023438,
"step": 2090
},
{
"epoch": 2.1494370522006143,
"grad_norm": 10.712281475799484,
"kl": 30.688316345214844,
"learning_rate": 2.2615857825036193e-07,
"logits/chosen": -121882752.0,
"logits/rejected": -131416184.0,
"logps/chosen": -552.6856689453125,
"logps/rejected": -909.7028198242188,
"loss": 0.1264,
"rewards/chosen": 4.177041689554851,
"rewards/margins": 28.032979647318523,
"rewards/rejected": -23.855937957763672,
"step": 2100
},
{
"epoch": 2.1596724667349028,
"grad_norm": 1.8332271817652683,
"kl": 39.04943084716797,
"learning_rate": 2.2119419554578368e-07,
"logits/chosen": -119259346.28571428,
"logits/rejected": -93369898.66666667,
"logps/chosen": -523.9795270647321,
"logps/rejected": -705.1712239583334,
"loss": 0.0919,
"rewards/chosen": 6.490928104945591,
"rewards/margins": 26.213579268682572,
"rewards/rejected": -19.72265116373698,
"step": 2110
},
{
"epoch": 2.169907881269191,
"grad_norm": 8.092134429001227,
"kl": 161.7046661376953,
"learning_rate": 2.1626938376236602e-07,
"logits/chosen": -126143765.33333333,
"logits/rejected": -112001426.28571428,
"logps/chosen": -463.4314371744792,
"logps/rejected": -742.9820731026786,
"loss": 0.1189,
"rewards/chosen": 8.039971669514975,
"rewards/margins": 22.077065785725914,
"rewards/rejected": -14.037094116210938,
"step": 2120
},
{
"epoch": 2.18014329580348,
"grad_norm": 8.024062355675337,
"kl": 97.97288513183594,
"learning_rate": 2.1138484187888095e-07,
"logits/chosen": -141612469.33333334,
"logits/rejected": -105401728.0,
"logps/chosen": -591.967529296875,
"logps/rejected": -688.3210100446429,
"loss": 0.1071,
"rewards/chosen": 8.986672719319662,
"rewards/margins": 22.672740572974796,
"rewards/rejected": -13.686067853655134,
"step": 2130
},
{
"epoch": 2.1903787103377685,
"grad_norm": 15.25176970177958,
"kl": 106.90888977050781,
"learning_rate": 2.0654126315859162e-07,
"logits/chosen": -106417353.14285715,
"logits/rejected": -109585565.53846154,
"logps/chosen": -549.0357840401786,
"logps/rejected": -735.4391526442307,
"loss": 0.1331,
"rewards/chosen": 4.569193158830915,
"rewards/margins": 20.35442100776421,
"rewards/rejected": -15.785227848933292,
"step": 2140
},
{
"epoch": 2.2006141248720574,
"grad_norm": 4.424610131492461,
"kl": 66.78227233886719,
"learning_rate": 2.017393350508572e-07,
"logits/chosen": -106947863.27272727,
"logits/rejected": -103769792.0,
"logps/chosen": -504.95876242897725,
"logps/rejected": -635.9586588541666,
"loss": 0.0994,
"rewards/chosen": 5.044010509144176,
"rewards/margins": 11.438200979521781,
"rewards/rejected": -6.3941904703776045,
"step": 2150
},
{
"epoch": 2.210849539406346,
"grad_norm": 12.489708937727526,
"kl": 99.99842834472656,
"learning_rate": 1.969797390935643e-07,
"logits/chosen": -111711104.0,
"logits/rejected": -113755212.8,
"logps/chosen": -533.645703125,
"logps/rejected": -773.666796875,
"loss": 0.1523,
"rewards/chosen": 5.977291870117187,
"rewards/margins": 20.843841552734375,
"rewards/rejected": -14.866549682617187,
"step": 2160
},
{
"epoch": 2.2210849539406348,
"grad_norm": 4.670440511594177,
"kl": 149.47491455078125,
"learning_rate": 1.9226315081639417e-07,
"logits/chosen": -125969038.22222222,
"logits/rejected": -112787362.9090909,
"logps/chosen": -551.9454752604166,
"logps/rejected": -647.052734375,
"loss": 0.1559,
"rewards/chosen": 8.319302876790365,
"rewards/margins": 19.287110993356414,
"rewards/rejected": -10.96780811656605,
"step": 2170
},
{
"epoch": 2.231320368474923,
"grad_norm": 16.031191242535417,
"kl": 60.552860260009766,
"learning_rate": 1.8759023964494663e-07,
"logits/chosen": -118854707.2,
"logits/rejected": -101221004.8,
"logps/chosen": -557.2357421875,
"logps/rejected": -587.901953125,
"loss": 0.1241,
"rewards/chosen": 5.938985824584961,
"rewards/margins": 11.856063842773438,
"rewards/rejected": -5.917078018188477,
"step": 2180
},
{
"epoch": 2.241555783009212,
"grad_norm": 12.757199732588536,
"kl": 122.61748504638672,
"learning_rate": 1.8296166880572805e-07,
"logits/chosen": -121700633.6,
"logits/rejected": -103405388.8,
"logps/chosen": -503.8498046875,
"logps/rejected": -634.86533203125,
"loss": 0.1402,
"rewards/chosen": 5.459800338745117,
"rewards/margins": 11.527409744262695,
"rewards/rejected": -6.067609405517578,
"step": 2190
},
{
"epoch": 2.2517911975435005,
"grad_norm": 1.807781957810491,
"kl": 54.06723403930664,
"learning_rate": 1.7837809523201885e-07,
"logits/chosen": -118060302.22222222,
"logits/rejected": -99475618.9090909,
"logps/chosen": -570.5062391493055,
"logps/rejected": -619.9396306818181,
"loss": 0.1579,
"rewards/chosen": 4.121121724446614,
"rewards/margins": 14.215212966456559,
"rewards/rejected": -10.094091242009943,
"step": 2200
},
{
"epoch": 2.262026612077789,
"grad_norm": 9.781604616798875,
"kl": 64.63507080078125,
"learning_rate": 1.7384016947063562e-07,
"logits/chosen": -121720960.0,
"logits/rejected": -132511338.66666667,
"logps/chosen": -631.6351841517857,
"logps/rejected": -796.120361328125,
"loss": 0.1219,
"rewards/chosen": 6.886650085449219,
"rewards/margins": 25.14352289835612,
"rewards/rejected": -18.256872812906902,
"step": 2210
},
{
"epoch": 2.272262026612078,
"grad_norm": 21.39354262825075,
"kl": 66.86387634277344,
"learning_rate": 1.6934853558959861e-07,
"logits/chosen": -110275956.36363636,
"logits/rejected": -107321664.0,
"logps/chosen": -514.1443093039773,
"logps/rejected": -617.8937717013889,
"loss": 0.159,
"rewards/chosen": 6.370298212224787,
"rewards/margins": 15.696105571708294,
"rewards/rejected": -9.325807359483507,
"step": 2220
},
{
"epoch": 2.2824974411463663,
"grad_norm": 22.076841086039263,
"kl": 36.765682220458984,
"learning_rate": 1.6490383108671923e-07,
"logits/chosen": -111057954.9090909,
"logits/rejected": -102580707.55555555,
"logps/chosen": -540.7732599431819,
"logps/rejected": -714.6677517361111,
"loss": 0.1526,
"rewards/chosen": 4.920051574707031,
"rewards/margins": 16.855376349555122,
"rewards/rejected": -11.935324774848091,
"step": 2230
},
{
"epoch": 2.292732855680655,
"grad_norm": 17.947879888518305,
"kl": 15.610729217529297,
"learning_rate": 1.605066867991207e-07,
"logits/chosen": -102536045.71428572,
"logits/rejected": -113235712.0,
"logps/chosen": -513.5811941964286,
"logps/rejected": -585.6139322916666,
"loss": 0.1546,
"rewards/chosen": 5.293537139892578,
"rewards/margins": 12.563283284505207,
"rewards/rejected": -7.26974614461263,
"step": 2240
},
{
"epoch": 2.3029682702149437,
"grad_norm": 11.405668194591636,
"kl": 12.592327117919922,
"learning_rate": 1.5615772681370154e-07,
"logits/chosen": -121222873.6,
"logits/rejected": -105986884.26666667,
"logps/chosen": -515.24560546875,
"logps/rejected": -681.1936848958334,
"loss": 0.1456,
"rewards/chosen": 4.4692131042480465,
"rewards/margins": 15.419220225016275,
"rewards/rejected": -10.950007120768229,
"step": 2250
},
{
"epoch": 2.313203684749232,
"grad_norm": 10.569983905365673,
"kl": 64.39910888671875,
"learning_rate": 1.5185756837856044e-07,
"logits/chosen": -111442368.0,
"logits/rejected": -115095463.38461539,
"logps/chosen": -488.7227260044643,
"logps/rejected": -654.4423076923077,
"loss": 0.135,
"rewards/chosen": 4.371343885149274,
"rewards/margins": 14.086171789483709,
"rewards/rejected": -9.714827904334435,
"step": 2260
},
{
"epoch": 2.323439099283521,
"grad_norm": 3.354324936297212,
"kl": 22.279205322265625,
"learning_rate": 1.4760682181539014e-07,
"logits/chosen": -130732770.46153846,
"logits/rejected": -109831277.71428572,
"logps/chosen": -586.5646033653846,
"logps/rejected": -723.4093191964286,
"loss": 0.1786,
"rewards/chosen": 5.04144521859976,
"rewards/margins": 24.284674675910026,
"rewards/rejected": -19.243229457310267,
"step": 2270
},
{
"epoch": 2.3336745138178094,
"grad_norm": 25.305933395761674,
"kl": 39.49127960205078,
"learning_rate": 1.4340609043285352e-07,
"logits/chosen": -116073890.9090909,
"logits/rejected": -128906240.0,
"logps/chosen": -541.4201882102273,
"logps/rejected": -702.7337239583334,
"loss": 0.1394,
"rewards/chosen": 4.314708709716797,
"rewards/margins": 15.29558605617947,
"rewards/rejected": -10.980877346462673,
"step": 2280
},
{
"epoch": 2.3439099283520983,
"grad_norm": 7.496154801810936,
"kl": 35.06755065917969,
"learning_rate": 1.392559704409565e-07,
"logits/chosen": -106620288.0,
"logits/rejected": -95783952.0,
"logps/chosen": -490.46142578125,
"logps/rejected": -753.1649169921875,
"loss": 0.1545,
"rewards/chosen": 4.967744827270508,
"rewards/margins": 30.458803176879883,
"rewards/rejected": -25.491058349609375,
"step": 2290
},
{
"epoch": 2.3541453428863868,
"grad_norm": 10.112974863426174,
"kl": 45.81691360473633,
"learning_rate": 1.351570508664281e-07,
"logits/chosen": -123060982.15384616,
"logits/rejected": -106164790.85714285,
"logps/chosen": -555.3134014423077,
"logps/rejected": -700.9796316964286,
"loss": 0.0849,
"rewards/chosen": 6.32999772291917,
"rewards/margins": 20.283706916557563,
"rewards/rejected": -13.953709193638392,
"step": 2300
},
{
"epoch": 2.3643807574206757,
"grad_norm": 12.497011915287993,
"kl": 54.70706558227539,
"learning_rate": 1.3110991346911937e-07,
"logits/chosen": -109132339.2,
"logits/rejected": -124739571.2,
"logps/chosen": -497.4130859375,
"logps/rejected": -851.44716796875,
"loss": 0.0718,
"rewards/chosen": 6.165841293334961,
"rewards/margins": 32.20471458435059,
"rewards/rejected": -26.038873291015626,
"step": 2310
},
{
"epoch": 2.374616171954964,
"grad_norm": 14.048872684243365,
"kl": 31.685832977294922,
"learning_rate": 1.271151326594352e-07,
"logits/chosen": -113037340.44444445,
"logits/rejected": -119740986.18181819,
"logps/chosen": -560.6264105902778,
"logps/rejected": -703.0914417613636,
"loss": 0.1018,
"rewards/chosen": 4.181118435329861,
"rewards/margins": 16.667807222616794,
"rewards/rejected": -12.486688787286932,
"step": 2320
},
{
"epoch": 2.384851586489253,
"grad_norm": 11.284739170383132,
"kl": 81.66230773925781,
"learning_rate": 1.2317327541680644e-07,
"logits/chosen": -116141714.28571428,
"logits/rejected": -117831989.33333333,
"logps/chosen": -503.6156529017857,
"logps/rejected": -756.2096354166666,
"loss": 0.1345,
"rewards/chosen": 6.625304630824497,
"rewards/margins": 17.272850127447217,
"rewards/rejected": -10.64754549662272,
"step": 2330
},
{
"epoch": 2.3950870010235414,
"grad_norm": 13.720827077271856,
"kl": 65.03724670410156,
"learning_rate": 1.1928490120922014e-07,
"logits/chosen": -121709582.22222222,
"logits/rejected": -105234408.72727273,
"logps/chosen": -556.2324761284722,
"logps/rejected": -708.0031516335227,
"loss": 0.1302,
"rewards/chosen": 4.338587866889106,
"rewards/margins": 17.573104357478595,
"rewards/rejected": -13.234516490589488,
"step": 2340
},
{
"epoch": 2.40532241555783,
"grad_norm": 5.575592393255063,
"kl": 9.638999938964844,
"learning_rate": 1.1545056191381381e-07,
"logits/chosen": -93364021.33333333,
"logits/rejected": -152067344.0,
"logps/chosen": -450.686279296875,
"logps/rejected": -931.534912109375,
"loss": 0.1371,
"rewards/chosen": 5.051417350769043,
"rewards/margins": 26.019991874694824,
"rewards/rejected": -20.96857452392578,
"step": 2350
},
{
"epoch": 2.407369498464688,
"eval_logits/chosen": -111141848.0,
"eval_logits/rejected": -99338928.0,
"eval_logps/chosen": -484.7994079589844,
"eval_logps/rejected": -496.7482604980469,
"eval_loss": 0.28797829151153564,
"eval_rewards/chosen": 1.4960663318634033,
"eval_rewards/margins": -5.191570997238159,
"eval_rewards/rejected": 6.6876373291015625,
"eval_runtime": 2.6674,
"eval_samples_per_second": 3.749,
"eval_steps_per_second": 0.75,
"kl": 0.0,
"step": 2352
},
{
"epoch": 2.4155578300921188,
"grad_norm": 10.829951404513368,
"kl": 122.53065490722656,
"learning_rate": 1.1167080173854682e-07,
"logits/chosen": -103706450.28571428,
"logits/rejected": -114846109.53846154,
"logps/chosen": -502.20877511160717,
"logps/rejected": -735.6909555288462,
"loss": 0.1348,
"rewards/chosen": 6.153853280203683,
"rewards/margins": 23.16724580198854,
"rewards/rejected": -17.013392521784855,
"step": 2360
},
{
"epoch": 2.425793244626407,
"grad_norm": 15.679831732247864,
"kl": 102.68487548828125,
"learning_rate": 1.0794615714496174e-07,
"logits/chosen": -113436697.6,
"logits/rejected": -126385536.0,
"logps/chosen": -537.893603515625,
"logps/rejected": -781.82001953125,
"loss": 0.1383,
"rewards/chosen": 6.26806869506836,
"rewards/margins": 16.49968566894531,
"rewards/rejected": -10.231616973876953,
"step": 2370
},
{
"epoch": 2.436028659160696,
"grad_norm": 14.38159184226354,
"kl": 82.27883911132812,
"learning_rate": 1.042771567720438e-07,
"logits/chosen": -112610340.57142857,
"logits/rejected": -105989385.84615384,
"logps/chosen": -570.0341099330357,
"logps/rejected": -723.4311899038462,
"loss": 0.1201,
"rewards/chosen": 5.978327069963727,
"rewards/margins": 21.12158479795351,
"rewards/rejected": -15.143257727989784,
"step": 2380
},
{
"epoch": 2.4462640736949846,
"grad_norm": 0.6965533738726798,
"kl": 99.01249694824219,
"learning_rate": 1.0066432136119124e-07,
"logits/chosen": -110192921.6,
"logits/rejected": -127216371.2,
"logps/chosen": -529.600537109375,
"logps/rejected": -701.580419921875,
"loss": 0.116,
"rewards/chosen": 4.8227394104003904,
"rewards/margins": 16.614155578613282,
"rewards/rejected": -11.79141616821289,
"step": 2390
},
{
"epoch": 2.4564994882292734,
"grad_norm": 15.536905841649384,
"kl": 37.292198181152344,
"learning_rate": 9.710816368230718e-08,
"logits/chosen": -124482245.81818181,
"logits/rejected": -94917866.66666667,
"logps/chosen": -550.3764204545455,
"logps/rejected": -566.4765082465278,
"loss": 0.1288,
"rewards/chosen": 6.663654674183238,
"rewards/margins": 15.713337291370738,
"rewards/rejected": -9.0496826171875,
"step": 2400
},
{
"epoch": 2.466734902763562,
"grad_norm": 6.068120588196036,
"kl": 35.238346099853516,
"learning_rate": 9.360918846102056e-08,
"logits/chosen": -117002346.66666667,
"logits/rejected": -113970642.28571428,
"logps/chosen": -617.9042154947916,
"logps/rejected": -671.12451171875,
"loss": 0.1702,
"rewards/chosen": 5.355202356974284,
"rewards/margins": 11.924032574608212,
"rewards/rejected": -6.568830217633929,
"step": 2410
},
{
"epoch": 2.4769703172978508,
"grad_norm": 16.413330514370553,
"kl": 38.19875717163086,
"learning_rate": 9.016789230705218e-08,
"logits/chosen": -121840032.0,
"logits/rejected": -112829514.66666667,
"logps/chosen": -500.3292541503906,
"logps/rejected": -700.7330729166666,
"loss": 0.127,
"rewards/chosen": 6.467748641967773,
"rewards/margins": 23.09494972229004,
"rewards/rejected": -16.627201080322266,
"step": 2420
},
{
"epoch": 2.487205731832139,
"grad_norm": 13.868930669170362,
"kl": 4.333488464355469,
"learning_rate": 8.678476364372967e-08,
"logits/chosen": -133496746.66666667,
"logits/rejected": -99632568.0,
"logps/chosen": -603.1192220052084,
"logps/rejected": -568.6356201171875,
"loss": 0.1184,
"rewards/chosen": 3.938859303792318,
"rewards/margins": 12.045438130696615,
"rewards/rejected": -8.106578826904297,
"step": 2430
},
{
"epoch": 2.4974411463664277,
"grad_norm": 12.860085633024877,
"kl": 46.0883903503418,
"learning_rate": 8.346028263866606e-08,
"logits/chosen": -107161856.0,
"logits/rejected": -118059622.4,
"logps/chosen": -491.523388671875,
"logps/rejected": -718.28203125,
"loss": 0.11,
"rewards/chosen": 4.71644172668457,
"rewards/margins": 19.74509162902832,
"rewards/rejected": -15.02864990234375,
"step": 2440
},
{
"epoch": 2.5076765609007166,
"grad_norm": 14.911278979967584,
"kl": 10.185958862304688,
"learning_rate": 8.019492113560938e-08,
"logits/chosen": -114439645.86666666,
"logits/rejected": -130004211.2,
"logps/chosen": -514.1682291666667,
"logps/rejected": -731.38642578125,
"loss": 0.1235,
"rewards/chosen": 5.1857854207356775,
"rewards/margins": 20.735327657063802,
"rewards/rejected": -15.549542236328126,
"step": 2450
},
{
"epoch": 2.517911975435005,
"grad_norm": 10.345603971630045,
"kl": 43.71831130981445,
"learning_rate": 7.698914258747392e-08,
"logits/chosen": -97093056.0,
"logits/rejected": -104960490.66666667,
"logps/chosen": -479.16485595703125,
"logps/rejected": -591.1146647135416,
"loss": 0.1591,
"rewards/chosen": 5.103046417236328,
"rewards/margins": 6.8243058522542315,
"rewards/rejected": -1.7212594350179036,
"step": 2460
},
{
"epoch": 2.528147389969294,
"grad_norm": 8.021925150938095,
"kl": 102.18989562988281,
"learning_rate": 7.384340199056216e-08,
"logits/chosen": -133009488.0,
"logits/rejected": -111899104.0,
"logps/chosen": -588.400146484375,
"logps/rejected": -699.8717447916666,
"loss": 0.1252,
"rewards/chosen": 4.981169700622559,
"rewards/margins": 16.865241050720215,
"rewards/rejected": -11.884071350097656,
"step": 2470
},
{
"epoch": 2.5383828045035823,
"grad_norm": 14.884330124927732,
"kl": 105.43118286132812,
"learning_rate": 7.07581458199879e-08,
"logits/chosen": -111587126.85714285,
"logits/rejected": -114068164.92307693,
"logps/chosen": -524.8841378348214,
"logps/rejected": -845.0369591346154,
"loss": 0.0733,
"rewards/chosen": 5.159061431884766,
"rewards/margins": 30.266236818753757,
"rewards/rejected": -25.10717538686899,
"step": 2480
},
{
"epoch": 2.548618219037871,
"grad_norm": 9.74651017994561,
"kl": 117.86248779296875,
"learning_rate": 6.773381196630656e-08,
"logits/chosen": -121217838.54545455,
"logits/rejected": -122516280.8888889,
"logps/chosen": -530.4308860085227,
"logps/rejected": -772.5392795138889,
"loss": 0.1062,
"rewards/chosen": 7.943235917524858,
"rewards/margins": 24.32046030988597,
"rewards/rejected": -16.37722439236111,
"step": 2490
},
{
"epoch": 2.5588536335721597,
"grad_norm": 11.454867890880834,
"kl": 32.92910385131836,
"learning_rate": 6.477082967336689e-08,
"logits/chosen": -123549474.13333334,
"logits/rejected": -121342182.4,
"logps/chosen": -668.5967447916667,
"logps/rejected": -656.6751953125,
"loss": 0.1401,
"rewards/chosen": 2.778960418701172,
"rewards/margins": 7.843721389770508,
"rewards/rejected": -5.064760971069336,
"step": 2500
},
{
"epoch": 2.5690890481064486,
"grad_norm": 11.646385280367278,
"kl": 101.4727783203125,
"learning_rate": 6.186961947738739e-08,
"logits/chosen": -108983193.6,
"logits/rejected": -121287667.2,
"logps/chosen": -504.491748046875,
"logps/rejected": -775.789453125,
"loss": 0.1439,
"rewards/chosen": 5.482083511352539,
"rewards/margins": 27.428909683227538,
"rewards/rejected": -21.946826171875,
"step": 2510
},
{
"epoch": 2.579324462640737,
"grad_norm": 7.859714543172971,
"kl": 45.51924133300781,
"learning_rate": 5.903059314726988e-08,
"logits/chosen": -106612451.55555555,
"logits/rejected": -140557684.36363637,
"logps/chosen": -491.76817491319446,
"logps/rejected": -827.0592151988636,
"loss": 0.1053,
"rewards/chosen": 6.706490834554036,
"rewards/margins": 26.80151286269679,
"rewards/rejected": -20.095022028142754,
"step": 2520
},
{
"epoch": 2.5895598771750254,
"grad_norm": 4.937490929049706,
"kl": 10.572738647460938,
"learning_rate": 5.625415362615721e-08,
"logits/chosen": -116113109.33333333,
"logits/rejected": -102797296.0,
"logps/chosen": -558.3600260416666,
"logps/rejected": -646.0928955078125,
"loss": 0.1756,
"rewards/chosen": 3.8815214369032116,
"rewards/margins": 7.237435552808973,
"rewards/rejected": -3.3559141159057617,
"step": 2530
},
{
"epoch": 2.5997952917093143,
"grad_norm": 14.37678972130025,
"kl": 63.45878601074219,
"learning_rate": 5.354069497424335e-08,
"logits/chosen": -128418880.0,
"logits/rejected": -113965074.28571428,
"logps/chosen": -584.1385498046875,
"logps/rejected": -602.3302176339286,
"loss": 0.1687,
"rewards/chosen": 6.9789072672526045,
"rewards/margins": 13.387665158226376,
"rewards/rejected": -6.408757890973773,
"step": 2540
},
{
"epoch": 2.610030706243603,
"grad_norm": 10.459219539605776,
"kl": 67.96239471435547,
"learning_rate": 5.089060231284453e-08,
"logits/chosen": -125863469.71428572,
"logits/rejected": -103395268.92307693,
"logps/chosen": -521.3954380580357,
"logps/rejected": -616.2382061298077,
"loss": 0.1155,
"rewards/chosen": 5.0010577610560825,
"rewards/margins": 12.883576235928377,
"rewards/rejected": -7.882518474872295,
"step": 2550
},
{
"epoch": 2.6202661207778917,
"grad_norm": 3.7716822556149108,
"kl": 13.6009521484375,
"learning_rate": 4.830425176973918e-08,
"logits/chosen": -115673611.63636364,
"logits/rejected": -123093077.33333333,
"logps/chosen": -535.6490589488636,
"logps/rejected": -707.8030056423611,
"loss": 0.1083,
"rewards/chosen": 3.2927263433283027,
"rewards/margins": 18.8191172667224,
"rewards/rejected": -15.526390923394096,
"step": 2560
},
{
"epoch": 2.63050153531218,
"grad_norm": 11.540542432175807,
"kl": 49.550472259521484,
"learning_rate": 4.578201042578317e-08,
"logits/chosen": -127875677.0909091,
"logits/rejected": -106003847.1111111,
"logps/chosen": -603.5329367897727,
"logps/rejected": -758.7048611111111,
"loss": 0.1564,
"rewards/chosen": 5.201872045343572,
"rewards/margins": 21.771740345039753,
"rewards/rejected": -16.569868299696182,
"step": 2570
},
{
"epoch": 2.6407369498464686,
"grad_norm": 15.048238698409907,
"kl": 65.9299087524414,
"learning_rate": 4.3324236262811395e-08,
"logits/chosen": -112790442.66666667,
"logits/rejected": -142388864.0,
"logps/chosen": -601.592041015625,
"logps/rejected": -929.5491333007812,
"loss": 0.1373,
"rewards/chosen": 1.01103679339091,
"rewards/margins": 23.71454707781474,
"rewards/rejected": -22.703510284423828,
"step": 2580
},
{
"epoch": 2.6509723643807575,
"grad_norm": 14.536384887849032,
"kl": 79.07972717285156,
"learning_rate": 4.0931278112828203e-08,
"logits/chosen": -124037196.8,
"logits/rejected": -107391513.6,
"logps/chosen": -587.8376953125,
"logps/rejected": -705.137939453125,
"loss": 0.1315,
"rewards/chosen": 4.753457641601562,
"rewards/margins": 18.607870483398436,
"rewards/rejected": -13.854412841796876,
"step": 2590
},
{
"epoch": 2.661207778915046,
"grad_norm": 11.239515091047844,
"kl": 61.44083786010742,
"learning_rate": 3.860347560849836e-08,
"logits/chosen": -123588676.92307693,
"logits/rejected": -121233179.42857143,
"logps/chosen": -569.4657451923077,
"logps/rejected": -808.5862165178571,
"loss": 0.0885,
"rewards/chosen": 6.6277606670673075,
"rewards/margins": 27.442394717709046,
"rewards/rejected": -20.81463405064174,
"step": 2600
},
{
"epoch": 2.671443193449335,
"grad_norm": 17.077056073007693,
"kl": 24.01074981689453,
"learning_rate": 3.634115913494257e-08,
"logits/chosen": -128678567.38461539,
"logits/rejected": -114697645.71428572,
"logps/chosen": -596.2249474158654,
"logps/rejected": -722.2978515625,
"loss": 0.1256,
"rewards/chosen": 6.063357426570012,
"rewards/margins": 14.008555485652042,
"rewards/rejected": -7.945198059082031,
"step": 2610
},
{
"epoch": 2.6816786079836232,
"grad_norm": 7.060231943175802,
"kl": 2.9792327880859375,
"learning_rate": 3.414464978284609e-08,
"logits/chosen": -123749196.8,
"logits/rejected": -113550476.8,
"logps/chosen": -564.08203125,
"logps/rejected": -686.4732421875,
"loss": 0.1022,
"rewards/chosen": 4.978203582763672,
"rewards/margins": 14.521561431884766,
"rewards/rejected": -9.543357849121094,
"step": 2620
},
{
"epoch": 2.691914022517912,
"grad_norm": 14.640691296223196,
"kl": 102.84552001953125,
"learning_rate": 3.201425930288648e-08,
"logits/chosen": -104803337.14285715,
"logits/rejected": -97529668.92307693,
"logps/chosen": -538.1909528459821,
"logps/rejected": -575.7629206730769,
"loss": 0.1412,
"rewards/chosen": 5.508868081229074,
"rewards/margins": 13.19678711105179,
"rewards/rejected": -7.687919029822717,
"step": 2630
},
{
"epoch": 2.7021494370522006,
"grad_norm": 6.935672879419478,
"kl": 4.456298828125,
"learning_rate": 2.995029006148631e-08,
"logits/chosen": -114078361.6,
"logits/rejected": -108900300.8,
"logps/chosen": -508.643212890625,
"logps/rejected": -690.940185546875,
"loss": 0.1285,
"rewards/chosen": 4.7735595703125,
"rewards/margins": 19.15091552734375,
"rewards/rejected": -14.37735595703125,
"step": 2640
},
{
"epoch": 2.7082906857727735,
"eval_logits/chosen": -114141008.0,
"eval_logits/rejected": -101458704.0,
"eval_logps/chosen": -486.99053955078125,
"eval_logps/rejected": -501.12237548828125,
"eval_loss": 0.31646159291267395,
"eval_rewards/chosen": 1.276953101158142,
"eval_rewards/margins": -4.973272919654846,
"eval_rewards/rejected": 6.250226020812988,
"eval_runtime": 2.6383,
"eval_samples_per_second": 3.79,
"eval_steps_per_second": 0.758,
"kl": 0.0,
"step": 2646
},
{
"epoch": 2.7123848515864895,
"grad_norm": 10.887928094851356,
"kl": 8.261627197265625,
"learning_rate": 2.795303499789864e-08,
"logits/chosen": -116870853.81818181,
"logits/rejected": -107267861.33333333,
"logps/chosen": -509.50319602272725,
"logps/rejected": -655.4784071180555,
"loss": 0.0932,
"rewards/chosen": 4.065437316894531,
"rewards/margins": 13.481632656521267,
"rewards/rejected": -9.416195339626736,
"step": 2650
},
{
"epoch": 2.722620266120778,
"grad_norm": 1.6419687975273198,
"kl": 35.763301849365234,
"learning_rate": 2.6022777582630384e-08,
"logits/chosen": -112244152.8888889,
"logits/rejected": -112315066.18181819,
"logps/chosen": -522.056640625,
"logps/rejected": -778.3561789772727,
"loss": 0.132,
"rewards/chosen": 5.681062486436632,
"rewards/margins": 27.134175001972856,
"rewards/rejected": -21.453112515536223,
"step": 2660
},
{
"epoch": 2.7328556806550663,
"grad_norm": 9.109659129909929,
"kl": 69.41864013671875,
"learning_rate": 2.4159791777208728e-08,
"logits/chosen": -113096832.0,
"logits/rejected": -117938201.6,
"logps/chosen": -536.3591796875,
"logps/rejected": -802.1228515625,
"loss": 0.1411,
"rewards/chosen": 5.3816673278808596,
"rewards/margins": 26.51123580932617,
"rewards/rejected": -21.12956848144531,
"step": 2670
},
{
"epoch": 2.7430910951893552,
"grad_norm": 2.540946323619725,
"kl": 104.94831848144531,
"learning_rate": 2.236434199529813e-08,
"logits/chosen": -113226342.4,
"logits/rejected": -109490892.8,
"logps/chosen": -558.754833984375,
"logps/rejected": -850.9609375,
"loss": 0.1044,
"rewards/chosen": 4.071189117431641,
"rewards/margins": 33.206589508056645,
"rewards/rejected": -29.135400390625,
"step": 2680
},
{
"epoch": 2.7533265097236437,
"grad_norm": 4.993767586326576,
"kl": 71.74156188964844,
"learning_rate": 2.063668306517197e-08,
"logits/chosen": -120991760.0,
"logits/rejected": -126745472.0,
"logps/chosen": -594.7095947265625,
"logps/rejected": -762.1270345052084,
"loss": 0.1218,
"rewards/chosen": 3.4017386436462402,
"rewards/margins": 19.160385290781655,
"rewards/rejected": -15.758646647135416,
"step": 2690
},
{
"epoch": 2.7635619242579326,
"grad_norm": 10.093784018688082,
"kl": 109.51588439941406,
"learning_rate": 1.897706019354478e-08,
"logits/chosen": -111541806.54545455,
"logits/rejected": -112799928.8888889,
"logps/chosen": -518.2110262784091,
"logps/rejected": -666.8862847222222,
"loss": 0.1316,
"rewards/chosen": 6.8094329833984375,
"rewards/margins": 24.927536010742188,
"rewards/rejected": -18.11810302734375,
"step": 2700
},
{
"epoch": 2.773797338792221,
"grad_norm": 6.280834542580219,
"kl": 58.57270050048828,
"learning_rate": 1.7385708930770294e-08,
"logits/chosen": -116748681.14285715,
"logits/rejected": -133647261.53846154,
"logps/chosen": -565.2002999441964,
"logps/rejected": -752.8988882211538,
"loss": 0.104,
"rewards/chosen": 7.381816319056919,
"rewards/margins": 21.23567702744033,
"rewards/rejected": -13.853860708383413,
"step": 2710
},
{
"epoch": 2.7840327533265095,
"grad_norm": 8.201048256790347,
"kl": 107.66532897949219,
"learning_rate": 1.5862855137409203e-08,
"logits/chosen": -115046613.33333333,
"logits/rejected": -128310921.14285715,
"logps/chosen": -652.1544596354166,
"logps/rejected": -716.2571149553571,
"loss": 0.1398,
"rewards/chosen": 0.37717580795288086,
"rewards/margins": 6.230465275900705,
"rewards/rejected": -5.853289467947824,
"step": 2720
},
{
"epoch": 2.7942681678607983,
"grad_norm": 13.370009278983177,
"kl": 48.944252014160156,
"learning_rate": 1.4408714952173162e-08,
"logits/chosen": -105869802.66666667,
"logits/rejected": -128118326.85714285,
"logps/chosen": -429.5232747395833,
"logps/rejected": -792.2711356026786,
"loss": 0.1207,
"rewards/chosen": 6.8134206136067705,
"rewards/margins": 18.571839105515252,
"rewards/rejected": -11.758418491908483,
"step": 2730
},
{
"epoch": 2.8045035823950872,
"grad_norm": 14.217809187421249,
"kl": 5.711326599121094,
"learning_rate": 1.3023494761248422e-08,
"logits/chosen": -137282560.0,
"logits/rejected": -104739466.66666667,
"logps/chosen": -653.2716064453125,
"logps/rejected": -704.67724609375,
"loss": 0.1089,
"rewards/chosen": 6.01470947265625,
"rewards/margins": 23.953670501708984,
"rewards/rejected": -17.938961029052734,
"step": 2740
},
{
"epoch": 2.8147389969293757,
"grad_norm": 19.336026642511733,
"kl": 2.7219467163085938,
"learning_rate": 1.1707391169002767e-08,
"logits/chosen": -134308893.53846154,
"logits/rejected": -110243748.57142857,
"logps/chosen": -623.1481370192307,
"logps/rejected": -802.8217075892857,
"loss": 0.1541,
"rewards/chosen": 2.4775123596191406,
"rewards/margins": 22.423845018659318,
"rewards/rejected": -19.946332659040177,
"step": 2750
},
{
"epoch": 2.824974411463664,
"grad_norm": 8.72024018742674,
"kl": 154.34445190429688,
"learning_rate": 1.0460590970082062e-08,
"logits/chosen": -115301104.0,
"logits/rejected": -118230410.66666667,
"logps/chosen": -532.9121704101562,
"logps/rejected": -702.4969075520834,
"loss": 0.1176,
"rewards/chosen": 4.131191730499268,
"rewards/margins": 18.579192320505776,
"rewards/rejected": -14.44800059000651,
"step": 2760
},
{
"epoch": 2.835209825997953,
"grad_norm": 20.017577796935008,
"kl": 86.5708999633789,
"learning_rate": 9.283271122898172e-09,
"logits/chosen": -125055464.72727273,
"logits/rejected": -112441578.66666667,
"logps/chosen": -558.66748046875,
"logps/rejected": -690.9308810763889,
"loss": 0.138,
"rewards/chosen": 5.285506855357777,
"rewards/margins": 12.411841151690243,
"rewards/rejected": -7.126334296332465,
"step": 2770
},
{
"epoch": 2.8454452405322415,
"grad_norm": 5.245051575737728,
"kl": 35.114925384521484,
"learning_rate": 8.175598724513234e-09,
"logits/chosen": -109534720.0,
"logits/rejected": -102300536.8888889,
"logps/chosen": -508.27436967329544,
"logps/rejected": -593.6958550347222,
"loss": 0.1155,
"rewards/chosen": 6.462843461470171,
"rewards/margins": 12.150792748037011,
"rewards/rejected": -5.68794928656684,
"step": 2780
},
{
"epoch": 2.8556806550665303,
"grad_norm": 7.518000103434843,
"kl": 38.176666259765625,
"learning_rate": 7.137730986923829e-09,
"logits/chosen": -124864187.07692307,
"logits/rejected": -117530971.42857143,
"logps/chosen": -513.7694936899038,
"logps/rejected": -756.7769252232143,
"loss": 0.105,
"rewards/chosen": 5.097284170297476,
"rewards/margins": 19.500482412484978,
"rewards/rejected": -14.4031982421875,
"step": 2790
},
{
"epoch": 2.865916069600819,
"grad_norm": 7.323346052279645,
"kl": 82.49562072753906,
"learning_rate": 6.1698152147475755e-09,
"logits/chosen": -126298890.66666667,
"logits/rejected": -132990744.0,
"logps/chosen": -573.87841796875,
"logps/rejected": -943.2960205078125,
"loss": 0.1373,
"rewards/chosen": 5.581108729044597,
"rewards/margins": 32.77851931254069,
"rewards/rejected": -27.197410583496094,
"step": 2800
},
{
"epoch": 2.8761514841351072,
"grad_norm": 10.36267978448859,
"kl": 61.939781188964844,
"learning_rate": 5.271988784316172e-09,
"logits/chosen": -123898256.0,
"logits/rejected": -113120874.66666667,
"logps/chosen": -540.285888671875,
"logps/rejected": -723.02099609375,
"loss": 0.1174,
"rewards/chosen": 6.676476955413818,
"rewards/margins": 20.52393356959025,
"rewards/rejected": -13.847456614176432,
"step": 2810
},
{
"epoch": 2.886386898669396,
"grad_norm": 13.425727534613003,
"kl": 77.56233215332031,
"learning_rate": 4.444379124178055e-09,
"logits/chosen": -122654720.0,
"logits/rejected": -109599568.0,
"logps/chosen": -599.6444905598959,
"logps/rejected": -687.8331298828125,
"loss": 0.168,
"rewards/chosen": 0.1978003184000651,
"rewards/margins": 3.7757269541422525,
"rewards/rejected": -3.5779266357421875,
"step": 2820
},
{
"epoch": 2.896622313203685,
"grad_norm": 15.4066210292021,
"kl": 84.69986724853516,
"learning_rate": 3.6871036970116952e-09,
"logits/chosen": -146604400.0,
"logits/rejected": -113729952.0,
"logps/chosen": -643.28515625,
"logps/rejected": -616.5157877604166,
"loss": 0.1298,
"rewards/chosen": 4.322854995727539,
"rewards/margins": 12.591309229532877,
"rewards/rejected": -8.268454233805338,
"step": 2830
},
{
"epoch": 2.9068577277379735,
"grad_norm": 9.219603587639986,
"kl": 79.7108154296875,
"learning_rate": 3.000269982954773e-09,
"logits/chosen": -115262836.36363636,
"logits/rejected": -134471665.7777778,
"logps/chosen": -507.20157137784093,
"logps/rejected": -831.7874348958334,
"loss": 0.1479,
"rewards/chosen": 3.993692571466619,
"rewards/margins": 21.894585041084675,
"rewards/rejected": -17.900892469618057,
"step": 2840
},
{
"epoch": 2.917093142272262,
"grad_norm": 16.91311490037144,
"kl": 77.60739135742188,
"learning_rate": 2.3839754643491526e-09,
"logits/chosen": -112859029.33333333,
"logits/rejected": -115164448.0,
"logps/chosen": -526.4039306640625,
"logps/rejected": -760.6889038085938,
"loss": 0.1223,
"rewards/chosen": 4.538237889607747,
"rewards/margins": 15.784871419270832,
"rewards/rejected": -11.246633529663086,
"step": 2850
},
{
"epoch": 2.927328556806551,
"grad_norm": 8.339427757770848,
"kl": 5.040294647216797,
"learning_rate": 1.838307611905343e-09,
"logits/chosen": -115087931.07692307,
"logits/rejected": -111349010.28571428,
"logps/chosen": -522.8479567307693,
"logps/rejected": -736.3690011160714,
"loss": 0.1385,
"rewards/chosen": 3.7734222412109375,
"rewards/margins": 20.419662475585938,
"rewards/rejected": -16.646240234375,
"step": 2860
},
{
"epoch": 2.9375639713408392,
"grad_norm": 2.308264911894697,
"kl": 47.0976676940918,
"learning_rate": 1.3633438722877033e-09,
"logits/chosen": -123765944.0,
"logits/rejected": -125191381.33333333,
"logps/chosen": -528.5211181640625,
"logps/rejected": -757.469482421875,
"loss": 0.1018,
"rewards/chosen": 5.355100631713867,
"rewards/margins": 23.6528263092041,
"rewards/rejected": -18.297725677490234,
"step": 2870
},
{
"epoch": 2.947799385875128,
"grad_norm": 10.925389055878155,
"kl": 47.42646408081055,
"learning_rate": 9.5915165712257e-10,
"logits/chosen": -115053260.8,
"logits/rejected": -94018624.0,
"logps/chosen": -509.125390625,
"logps/rejected": -618.67490234375,
"loss": 0.1529,
"rewards/chosen": 5.673241424560547,
"rewards/margins": 16.379257202148438,
"rewards/rejected": -10.70601577758789,
"step": 2880
},
{
"epoch": 2.9580348004094166,
"grad_norm": 12.802240055553504,
"kl": 80.9378662109375,
"learning_rate": 6.257883334302994e-10,
"logits/chosen": -112381975.27272727,
"logits/rejected": -109818197.33333333,
"logps/chosen": -535.2524857954545,
"logps/rejected": -599.8187391493055,
"loss": 0.1487,
"rewards/chosen": 5.386872725053267,
"rewards/margins": 6.529768317636817,
"rewards/rejected": -1.1428955925835504,
"step": 2890
},
{
"epoch": 2.968270214943705,
"grad_norm": 11.03345507942441,
"kl": 18.87073516845703,
"learning_rate": 3.6330121548344696e-10,
"logits/chosen": -119768729.6,
"logits/rejected": -113732620.8,
"logps/chosen": -533.147265625,
"logps/rejected": -648.48359375,
"loss": 0.1007,
"rewards/chosen": 5.657349395751953,
"rewards/margins": 18.19538269042969,
"rewards/rejected": -12.538033294677735,
"step": 2900
},
{
"epoch": 2.978505629477994,
"grad_norm": 5.957476587134599,
"kl": 15.592277526855469,
"learning_rate": 1.7172755809119476e-10,
"logits/chosen": -118812183.27272727,
"logits/rejected": -118075306.66666667,
"logps/chosen": -516.0139382102273,
"logps/rejected": -703.6335720486111,
"loss": 0.1288,
"rewards/chosen": 5.660085851495916,
"rewards/margins": 22.83608072454279,
"rewards/rejected": -17.175994873046875,
"step": 2910
},
{
"epoch": 2.9887410440122824,
"grad_norm": 8.59267719952877,
"kl": 68.4888687133789,
"learning_rate": 5.10945513118588e-11,
"logits/chosen": -133802453.33333333,
"logits/rejected": -118769832.0,
"logps/chosen": -579.6878662109375,
"logps/rejected": -798.2313232421875,
"loss": 0.1533,
"rewards/chosen": 4.62050183614095,
"rewards/margins": 28.16325314839681,
"rewards/rejected": -23.54275131225586,
"step": 2920
},
{
"epoch": 2.9989764585465712,
"grad_norm": 9.368139533103754,
"kl": 44.17957305908203,
"learning_rate": 1.419316593864739e-12,
"logits/chosen": -123620633.6,
"logits/rejected": -123439654.4,
"logps/chosen": -555.02197265625,
"logps/rejected": -822.5419921875,
"loss": 0.1122,
"rewards/chosen": 5.326744079589844,
"rewards/margins": 26.535008239746094,
"rewards/rejected": -21.20826416015625,
"step": 2930
},
{
"epoch": 3.0,
"step": 2931,
"total_flos": 6.959030383253914e+16,
"train_loss": 0.2375620225312888,
"train_runtime": 17678.1972,
"train_samples_per_second": 2.652,
"train_steps_per_second": 0.166
}
],
"logging_steps": 10,
"max_steps": 2931,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 294,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.959030383253914e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}