GuardReasoner-1B-HS-DPO / trainer_state.json
6Amber6's picture
Upload folder using huggingface_hub
e840961 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 5676,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007049545084043795,
"grad_norm": 5.251685782291382,
"learning_rate": 4.999861761763694e-06,
"logits/chosen": -2.4226253032684326,
"logits/rejected": -2.0827410221099854,
"logps/chosen": -369.7997131347656,
"logps/rejected": -72.51252746582031,
"loss": 2.7826,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 3.4676175117492676,
"rewards/margins": 3.82804799079895,
"rewards/rejected": -0.36043041944503784,
"step": 20
},
{
"epoch": 0.01409909016808759,
"grad_norm": 4.303040759809255,
"learning_rate": 4.999417578584734e-06,
"logits/chosen": -2.213839292526245,
"logits/rejected": -1.9602348804473877,
"logps/chosen": -219.6575164794922,
"logps/rejected": -121.3716049194336,
"loss": 1.5829,
"rewards/accuracies": 1.0,
"rewards/chosen": 4.987636089324951,
"rewards/margins": 5.836641788482666,
"rewards/rejected": -0.8490053415298462,
"step": 40
},
{
"epoch": 0.021148635252131384,
"grad_norm": 4.576766531423992,
"learning_rate": 4.998667121957487e-06,
"logits/chosen": -2.1551291942596436,
"logits/rejected": -1.8213732242584229,
"logps/chosen": -195.5138397216797,
"logps/rejected": -153.83941650390625,
"loss": 1.4187,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.166686058044434,
"rewards/margins": 6.340417385101318,
"rewards/rejected": -1.1737314462661743,
"step": 60
},
{
"epoch": 0.02819818033617518,
"grad_norm": 4.045608043687929,
"learning_rate": 4.997610483841349e-06,
"logits/chosen": -2.1854336261749268,
"logits/rejected": -1.8104079961776733,
"logps/chosen": -181.33554077148438,
"logps/rejected": -178.6750946044922,
"loss": 1.3198,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.2234673500061035,
"rewards/margins": 6.645481109619141,
"rewards/rejected": -1.4220136404037476,
"step": 80
},
{
"epoch": 0.03524772542021898,
"grad_norm": 4.309056287854017,
"learning_rate": 4.996247793714565e-06,
"logits/chosen": -2.153714418411255,
"logits/rejected": -1.7523826360702515,
"logps/chosen": -181.3177947998047,
"logps/rejected": -197.3472442626953,
"loss": 1.3159,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.32370662689209,
"rewards/margins": 6.932420253753662,
"rewards/rejected": -1.6087143421173096,
"step": 100
},
{
"epoch": 0.04229727050426277,
"grad_norm": 4.38954998222076,
"learning_rate": 4.994579218558358e-06,
"logits/chosen": -2.1097121238708496,
"logits/rejected": -1.6037172079086304,
"logps/chosen": -171.5686798095703,
"logps/rejected": -214.49600219726562,
"loss": 1.2613,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.3470964431762695,
"rewards/margins": 7.127236843109131,
"rewards/rejected": -1.7801411151885986,
"step": 120
},
{
"epoch": 0.049346815588306565,
"grad_norm": 4.170152738327058,
"learning_rate": 4.992604962836471e-06,
"logits/chosen": -2.061870574951172,
"logits/rejected": -1.5608799457550049,
"logps/chosen": -175.2593231201172,
"logps/rejected": -227.2431182861328,
"loss": 1.2586,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.457849979400635,
"rewards/margins": 7.365653991699219,
"rewards/rejected": -1.9078038930892944,
"step": 140
},
{
"epoch": 0.05639636067235036,
"grad_norm": 3.7793931281850064,
"learning_rate": 4.990325268470103e-06,
"logits/chosen": -2.0291988849639893,
"logits/rejected": -1.5090599060058594,
"logps/chosen": -170.09222412109375,
"logps/rejected": -238.5521240234375,
"loss": 1.2228,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.505599021911621,
"rewards/margins": 7.526421546936035,
"rewards/rejected": -2.020822525024414,
"step": 160
},
{
"epoch": 0.06344590575639415,
"grad_norm": 4.03339809015517,
"learning_rate": 4.987740414808279e-06,
"logits/chosen": -2.0087292194366455,
"logits/rejected": -1.4583772420883179,
"logps/chosen": -171.950927734375,
"logps/rejected": -249.205810546875,
"loss": 1.2355,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.471736431121826,
"rewards/margins": 7.599116802215576,
"rewards/rejected": -2.1273796558380127,
"step": 180
},
{
"epoch": 0.07049545084043796,
"grad_norm": 3.8396983397552003,
"learning_rate": 4.9848507185936054e-06,
"logits/chosen": -1.9715420007705688,
"logits/rejected": -1.3983051776885986,
"logps/chosen": -167.9102020263672,
"logps/rejected": -258.29931640625,
"loss": 1.2011,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.551548957824707,
"rewards/margins": 7.769871711730957,
"rewards/rejected": -2.218322992324829,
"step": 200
},
{
"epoch": 0.07754499592448175,
"grad_norm": 3.8300256337552883,
"learning_rate": 4.9816565339234665e-06,
"logits/chosen": -1.9795246124267578,
"logits/rejected": -1.4027624130249023,
"logps/chosen": -159.7227783203125,
"logps/rejected": -267.54132080078125,
"loss": 1.1606,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.484537601470947,
"rewards/margins": 7.795154094696045,
"rewards/rejected": -2.3106157779693604,
"step": 220
},
{
"epoch": 0.08459454100852554,
"grad_norm": 3.6985177789621537,
"learning_rate": 4.978158252206628e-06,
"logits/chosen": -1.8784259557724,
"logits/rejected": -1.339755892753601,
"logps/chosen": -159.25930786132812,
"logps/rejected": -274.899658203125,
"loss": 1.1487,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.546479225158691,
"rewards/margins": 7.930718898773193,
"rewards/rejected": -2.384239912033081,
"step": 240
},
{
"epoch": 0.09164408609256934,
"grad_norm": 4.180449865034002,
"learning_rate": 4.9743563021152815e-06,
"logits/chosen": -1.8454569578170776,
"logits/rejected": -1.3575037717819214,
"logps/chosen": -160.8242950439453,
"logps/rejected": -281.50543212890625,
"loss": 1.1609,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.501753330230713,
"rewards/margins": 7.952068328857422,
"rewards/rejected": -2.45031476020813,
"step": 260
},
{
"epoch": 0.09869363117661313,
"grad_norm": 4.150749378416668,
"learning_rate": 4.970251149532508e-06,
"logits/chosen": -1.867268443107605,
"logits/rejected": -1.3786919116973877,
"logps/chosen": -162.6690673828125,
"logps/rejected": -284.708984375,
"loss": 1.1587,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.61737585067749,
"rewards/margins": 8.099893569946289,
"rewards/rejected": -2.4825170040130615,
"step": 280
},
{
"epoch": 0.10574317626065693,
"grad_norm": 3.9481813668439925,
"learning_rate": 4.965843297495193e-06,
"logits/chosen": -1.8595079183578491,
"logits/rejected": -1.3652888536453247,
"logps/chosen": -160.74594116210938,
"logps/rejected": -290.0582275390625,
"loss": 1.1566,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.584461212158203,
"rewards/margins": 8.120338439941406,
"rewards/rejected": -2.535876989364624,
"step": 300
},
{
"epoch": 0.11279272134470072,
"grad_norm": 3.892019725921358,
"learning_rate": 4.9611332861323875e-06,
"logits/chosen": -1.815410852432251,
"logits/rejected": -1.345049262046814,
"logps/chosen": -158.49656677246094,
"logps/rejected": -296.3229675292969,
"loss": 1.1334,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.662566661834717,
"rewards/margins": 8.261143684387207,
"rewards/rejected": -2.5985770225524902,
"step": 320
},
{
"epoch": 0.11984226642874451,
"grad_norm": 4.074050442543863,
"learning_rate": 4.956121692599119e-06,
"logits/chosen": -1.8308916091918945,
"logits/rejected": -1.3203500509262085,
"logps/chosen": -156.52281188964844,
"logps/rejected": -301.0868225097656,
"loss": 1.1271,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.592663288116455,
"rewards/margins": 8.23884105682373,
"rewards/rejected": -2.6461777687072754,
"step": 340
},
{
"epoch": 0.1268918115127883,
"grad_norm": 3.8941695396480807,
"learning_rate": 4.95080913100567e-06,
"logits/chosen": -1.7772724628448486,
"logits/rejected": -1.2583340406417847,
"logps/chosen": -152.46060180664062,
"logps/rejected": -305.64990234375,
"loss": 1.1067,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.578180313110352,
"rewards/margins": 8.269899368286133,
"rewards/rejected": -2.6917190551757812,
"step": 360
},
{
"epoch": 0.13394135659683212,
"grad_norm": 3.986642176301124,
"learning_rate": 4.945196252342323e-06,
"logits/chosen": -1.746899962425232,
"logits/rejected": -1.252589464187622,
"logps/chosen": -156.21347045898438,
"logps/rejected": -308.6796569824219,
"loss": 1.1189,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.675221920013428,
"rewards/margins": 8.397418975830078,
"rewards/rejected": -2.7221975326538086,
"step": 380
},
{
"epoch": 0.1409909016808759,
"grad_norm": 3.835238186793769,
"learning_rate": 4.9392837443995935e-06,
"logits/chosen": -1.7575358152389526,
"logits/rejected": -1.2477468252182007,
"logps/chosen": -152.92015075683594,
"logps/rejected": -313.30816650390625,
"loss": 1.0949,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6858229637146,
"rewards/margins": 8.454258918762207,
"rewards/rejected": -2.76843523979187,
"step": 400
},
{
"epoch": 0.1480404467649197,
"grad_norm": 3.7735268672320554,
"learning_rate": 4.933072331683947e-06,
"logits/chosen": -1.7370452880859375,
"logits/rejected": -1.234609603881836,
"logps/chosen": -157.04910278320312,
"logps/rejected": -316.3162536621094,
"loss": 1.1293,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.680102825164795,
"rewards/margins": 8.478598594665527,
"rewards/rejected": -2.7984960079193115,
"step": 420
},
{
"epoch": 0.1550899918489635,
"grad_norm": 3.5998804416212145,
"learning_rate": 4.9265627753290195e-06,
"logits/chosen": -1.7116130590438843,
"logits/rejected": -1.1605967283248901,
"logps/chosen": -153.48583984375,
"logps/rejected": -319.1722412109375,
"loss": 1.1021,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.628140926361084,
"rewards/margins": 8.455215454101562,
"rewards/rejected": -2.827075481414795,
"step": 440
},
{
"epoch": 0.16213953693300728,
"grad_norm": 3.619213515730465,
"learning_rate": 4.9197558730023524e-06,
"logits/chosen": -1.725412368774414,
"logits/rejected": -1.16403067111969,
"logps/chosen": -149.7213897705078,
"logps/rejected": -323.0830383300781,
"loss": 1.0849,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.608808994293213,
"rewards/margins": 8.474872589111328,
"rewards/rejected": -2.8660638332366943,
"step": 460
},
{
"epoch": 0.16918908201705107,
"grad_norm": 3.8639522615901845,
"learning_rate": 4.912652458807642e-06,
"logits/chosen": -1.6482775211334229,
"logits/rejected": -1.121797800064087,
"logps/chosen": -151.90098571777344,
"logps/rejected": -325.0082092285156,
"loss": 1.0933,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6815924644470215,
"rewards/margins": 8.567051887512207,
"rewards/rejected": -2.8854596614837646,
"step": 480
},
{
"epoch": 0.1762386271010949,
"grad_norm": 3.6715917089410746,
"learning_rate": 4.905253403182541e-06,
"logits/chosen": -1.6202948093414307,
"logits/rejected": -1.0874967575073242,
"logps/chosen": -152.82394409179688,
"logps/rejected": -328.7615051269531,
"loss": 1.0932,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.710689067840576,
"rewards/margins": 8.633631706237793,
"rewards/rejected": -2.922943353652954,
"step": 500
},
{
"epoch": 0.18328817218513868,
"grad_norm": 3.6185403170755275,
"learning_rate": 4.897559612791982e-06,
"logits/chosen": -1.6128886938095093,
"logits/rejected": -1.08529531955719,
"logps/chosen": -150.95596313476562,
"logps/rejected": -332.5636901855469,
"loss": 1.0892,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.6448516845703125,
"rewards/margins": 8.605791091918945,
"rewards/rejected": -2.9609391689300537,
"step": 520
},
{
"epoch": 0.19033771726918247,
"grad_norm": 4.961359246548627,
"learning_rate": 4.889572030417091e-06,
"logits/chosen": -1.5982520580291748,
"logits/rejected": -1.0809301137924194,
"logps/chosen": -145.88729858398438,
"logps/rejected": -337.07427978515625,
"loss": 1.0691,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.617259979248047,
"rewards/margins": 8.623309135437012,
"rewards/rejected": -3.0060489177703857,
"step": 540
},
{
"epoch": 0.19738726235322626,
"grad_norm": 3.828455969236192,
"learning_rate": 4.881291634839652e-06,
"logits/chosen": -1.6171013116836548,
"logits/rejected": -1.136992335319519,
"logps/chosen": -148.6068572998047,
"logps/rejected": -330.5426025390625,
"loss": 1.0661,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.703982830047607,
"rewards/margins": 8.644716262817383,
"rewards/rejected": -2.940732479095459,
"step": 560
},
{
"epoch": 0.20443680743727005,
"grad_norm": 3.9472389539377692,
"learning_rate": 4.872719440722171e-06,
"logits/chosen": -1.626006007194519,
"logits/rejected": -1.1716492176055908,
"logps/chosen": -145.8668670654297,
"logps/rejected": -331.05413818359375,
"loss": 1.0442,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7226738929748535,
"rewards/margins": 8.668530464172363,
"rewards/rejected": -2.945856809616089,
"step": 580
},
{
"epoch": 0.21148635252131387,
"grad_norm": 3.581652653947185,
"learning_rate": 4.863856498483545e-06,
"logits/chosen": -1.5996553897857666,
"logits/rejected": -1.1265536546707153,
"logps/chosen": -147.13487243652344,
"logps/rejected": -336.3114929199219,
"loss": 1.0497,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.683503150939941,
"rewards/margins": 8.681916236877441,
"rewards/rejected": -2.9984130859375,
"step": 600
},
{
"epoch": 0.21853589760535766,
"grad_norm": 3.6006765539179186,
"learning_rate": 4.854703894170342e-06,
"logits/chosen": -1.5823760032653809,
"logits/rejected": -1.0852136611938477,
"logps/chosen": -143.90567016601562,
"logps/rejected": -339.850830078125,
"loss": 1.0334,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.708090782165527,
"rewards/margins": 8.741941452026367,
"rewards/rejected": -3.0338504314422607,
"step": 620
},
{
"epoch": 0.22558544268940145,
"grad_norm": 3.673035488034299,
"learning_rate": 4.845262749323716e-06,
"logits/chosen": -1.5890778303146362,
"logits/rejected": -1.125832200050354,
"logps/chosen": -144.8064422607422,
"logps/rejected": -342.69403076171875,
"loss": 1.0435,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.719069957733154,
"rewards/margins": 8.781278610229492,
"rewards/rejected": -3.0622079372406006,
"step": 640
},
{
"epoch": 0.23263498777344524,
"grad_norm": 3.7341178394643464,
"learning_rate": 4.835534220841986e-06,
"logits/chosen": -1.5730316638946533,
"logits/rejected": -1.0738062858581543,
"logps/chosen": -147.9169158935547,
"logps/rejected": -345.677978515625,
"loss": 1.0506,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.784816741943359,
"rewards/margins": 8.876952171325684,
"rewards/rejected": -3.092135190963745,
"step": 660
},
{
"epoch": 0.23968453285748903,
"grad_norm": 3.5484016113310126,
"learning_rate": 4.825519500838861e-06,
"logits/chosen": -1.5520037412643433,
"logits/rejected": -1.0344088077545166,
"logps/chosen": -143.9144287109375,
"logps/rejected": -348.7065124511719,
"loss": 1.0334,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.744712829589844,
"rewards/margins": 8.867079734802246,
"rewards/rejected": -3.122366189956665,
"step": 680
},
{
"epoch": 0.24673407794153285,
"grad_norm": 3.9857566294011275,
"learning_rate": 4.815219816497369e-06,
"logits/chosen": -1.5165714025497437,
"logits/rejected": -0.994964599609375,
"logps/chosen": -144.06155395507812,
"logps/rejected": -351.51776123046875,
"loss": 1.0338,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.70768928527832,
"rewards/margins": 8.858179092407227,
"rewards/rejected": -3.150489091873169,
"step": 700
},
{
"epoch": 0.2537836230255766,
"grad_norm": 3.6922569534263223,
"learning_rate": 4.804636429919477e-06,
"logits/chosen": -1.5148602724075317,
"logits/rejected": -0.9885136485099792,
"logps/chosen": -147.0210418701172,
"logps/rejected": -354.9969177246094,
"loss": 1.0549,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.779511451721191,
"rewards/margins": 8.96484375,
"rewards/rejected": -3.1853320598602295,
"step": 720
},
{
"epoch": 0.26083316810962043,
"grad_norm": 3.7203570683891285,
"learning_rate": 4.793770637971432e-06,
"logits/chosen": -1.4832053184509277,
"logits/rejected": -0.9658412337303162,
"logps/chosen": -142.5421600341797,
"logps/rejected": -355.7452697753906,
"loss": 1.0267,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.72859525680542,
"rewards/margins": 8.921265602111816,
"rewards/rejected": -3.19266939163208,
"step": 740
},
{
"epoch": 0.26788271319366425,
"grad_norm": 3.523981090385713,
"learning_rate": 4.782623772124854e-06,
"logits/chosen": -1.4618552923202515,
"logits/rejected": -1.0074139833450317,
"logps/chosen": -143.67218017578125,
"logps/rejected": -357.75860595703125,
"loss": 1.0341,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.69673490524292,
"rewards/margins": 8.909579277038574,
"rewards/rejected": -3.212844133377075,
"step": 760
},
{
"epoch": 0.274932258277708,
"grad_norm": 3.543830337375067,
"learning_rate": 4.771197198293574e-06,
"logits/chosen": -1.439631700515747,
"logits/rejected": -0.9814452528953552,
"logps/chosen": -143.27450561523438,
"logps/rejected": -361.0159606933594,
"loss": 1.0287,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.738966464996338,
"rewards/margins": 8.98444938659668,
"rewards/rejected": -3.245483160018921,
"step": 780
},
{
"epoch": 0.2819818033617518,
"grad_norm": 3.7840061115601817,
"learning_rate": 4.7594923166662604e-06,
"logits/chosen": -1.4854661226272583,
"logits/rejected": -0.973606288433075,
"logps/chosen": -142.18592834472656,
"logps/rejected": -361.9082946777344,
"loss": 1.0202,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.734087944030762,
"rewards/margins": 8.988512992858887,
"rewards/rejected": -3.2544243335723877,
"step": 800
},
{
"epoch": 0.2890313484457956,
"grad_norm": 3.444393315103109,
"learning_rate": 4.74751056153484e-06,
"logits/chosen": -1.4658998250961304,
"logits/rejected": -0.9327756762504578,
"logps/chosen": -138.7262420654297,
"logps/rejected": -363.95166015625,
"loss": 1.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.733180522918701,
"rewards/margins": 9.007993698120117,
"rewards/rejected": -3.274812698364258,
"step": 820
},
{
"epoch": 0.2960808935298394,
"grad_norm": 3.565026142089265,
"learning_rate": 4.7352534011187465e-06,
"logits/chosen": -1.468241572380066,
"logits/rejected": -0.9021957516670227,
"logps/chosen": -140.53086853027344,
"logps/rejected": -365.4902038574219,
"loss": 1.0082,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.738412380218506,
"rewards/margins": 9.028605461120605,
"rewards/rejected": -3.2901928424835205,
"step": 840
},
{
"epoch": 0.3031304386138832,
"grad_norm": 3.896801938758768,
"learning_rate": 4.722722337385005e-06,
"logits/chosen": -1.4516186714172363,
"logits/rejected": -0.9541074633598328,
"logps/chosen": -139.03001403808594,
"logps/rejected": -366.9988098144531,
"loss": 1.0077,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.699275016784668,
"rewards/margins": 9.004546165466309,
"rewards/rejected": -3.3052709102630615,
"step": 860
},
{
"epoch": 0.310179983697927,
"grad_norm": 3.502095791512193,
"learning_rate": 4.709918905864188e-06,
"logits/chosen": -1.3555845022201538,
"logits/rejected": -0.9211882948875427,
"logps/chosen": -144.34078979492188,
"logps/rejected": -366.9481506347656,
"loss": 1.038,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.776908874511719,
"rewards/margins": 9.081772804260254,
"rewards/rejected": -3.3048646450042725,
"step": 880
},
{
"epoch": 0.3172295287819708,
"grad_norm": 3.6471771757282254,
"learning_rate": 4.696844675462248e-06,
"logits/chosen": -1.377302646636963,
"logits/rejected": -0.9443565607070923,
"logps/chosen": -140.3704376220703,
"logps/rejected": -363.6595153808594,
"loss": 1.0094,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.706255912780762,
"rewards/margins": 8.9780912399292,
"rewards/rejected": -3.2718353271484375,
"step": 900
},
{
"epoch": 0.32427907386601457,
"grad_norm": 3.6615699755313194,
"learning_rate": 4.683501248268274e-06,
"logits/chosen": -1.3805917501449585,
"logits/rejected": -0.9572538733482361,
"logps/chosen": -138.28404235839844,
"logps/rejected": -365.3840026855469,
"loss": 0.999,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.751240253448486,
"rewards/margins": 9.04038143157959,
"rewards/rejected": -3.2891411781311035,
"step": 920
},
{
"epoch": 0.3313286189500584,
"grad_norm": 3.318362750466131,
"learning_rate": 4.66989025935817e-06,
"logits/chosen": -1.3741607666015625,
"logits/rejected": -0.9632787108421326,
"logps/chosen": -140.46435546875,
"logps/rejected": -369.9010314941406,
"loss": 1.0078,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.778801918029785,
"rewards/margins": 9.11315631866455,
"rewards/rejected": -3.334355115890503,
"step": 940
},
{
"epoch": 0.33837816403410215,
"grad_norm": 3.582839489017987,
"learning_rate": 4.6560133765943006e-06,
"logits/chosen": -1.3752954006195068,
"logits/rejected": -0.9404975771903992,
"logps/chosen": -135.32681274414062,
"logps/rejected": -372.66534423828125,
"loss": 0.9837,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.729502201080322,
"rewards/margins": 9.091387748718262,
"rewards/rejected": -3.3618857860565186,
"step": 960
},
{
"epoch": 0.34542770911814596,
"grad_norm": 3.5417976857081284,
"learning_rate": 4.641872300421108e-06,
"logits/chosen": -1.3784650564193726,
"logits/rejected": -0.8944051861763,
"logps/chosen": -143.4613494873047,
"logps/rejected": -375.3058776855469,
"loss": 1.025,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.82838773727417,
"rewards/margins": 9.216733932495117,
"rewards/rejected": -3.388345718383789,
"step": 980
},
{
"epoch": 0.3524772542021898,
"grad_norm": 3.4196142612670313,
"learning_rate": 4.62746876365675e-06,
"logits/chosen": -1.3793405294418335,
"logits/rejected": -0.8886861801147461,
"logps/chosen": -137.5000762939453,
"logps/rejected": -377.03350830078125,
"loss": 0.9952,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.801843166351318,
"rewards/margins": 9.207419395446777,
"rewards/rejected": -3.4055771827697754,
"step": 1000
},
{
"epoch": 0.35952679928623354,
"grad_norm": 3.7855239785429533,
"learning_rate": 4.61280453128076e-06,
"logits/chosen": -1.372959852218628,
"logits/rejected": -0.9385848045349121,
"logps/chosen": -138.4940948486328,
"logps/rejected": -368.65045166015625,
"loss": 1.0047,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.794959545135498,
"rewards/margins": 9.116719245910645,
"rewards/rejected": -3.3217601776123047,
"step": 1020
},
{
"epoch": 0.36657634437027736,
"grad_norm": 3.540584082558615,
"learning_rate": 4.597881400217773e-06,
"logits/chosen": -1.3944286108016968,
"logits/rejected": -1.0817062854766846,
"logps/chosen": -135.97718811035156,
"logps/rejected": -361.6622009277344,
"loss": 0.9877,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.732675075531006,
"rewards/margins": 8.984468460083008,
"rewards/rejected": -3.2517929077148438,
"step": 1040
},
{
"epoch": 0.3736258894543211,
"grad_norm": 3.587308477754233,
"learning_rate": 4.5827011991173284e-06,
"logits/chosen": -1.3007700443267822,
"logits/rejected": -1.0200260877609253,
"logps/chosen": -137.0142364501953,
"logps/rejected": -360.9063415527344,
"loss": 0.9879,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7438249588012695,
"rewards/margins": 8.988321304321289,
"rewards/rejected": -3.2444958686828613,
"step": 1060
},
{
"epoch": 0.38067543453836494,
"grad_norm": 3.4394832652194562,
"learning_rate": 4.5672657881298e-06,
"logits/chosen": -1.3347376585006714,
"logits/rejected": -1.0313987731933594,
"logps/chosen": -137.21844482421875,
"logps/rejected": -358.77288818359375,
"loss": 0.9891,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.776368141174316,
"rewards/margins": 8.999374389648438,
"rewards/rejected": -3.2230064868927,
"step": 1080
},
{
"epoch": 0.38772497962240876,
"grad_norm": 3.7326559968101463,
"learning_rate": 4.551577058678447e-06,
"logits/chosen": -1.3314663171768188,
"logits/rejected": -0.9995508193969727,
"logps/chosen": -133.8922576904297,
"logps/rejected": -362.88214111328125,
"loss": 0.9669,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.811877250671387,
"rewards/margins": 9.075952529907227,
"rewards/rejected": -3.2640750408172607,
"step": 1100
},
{
"epoch": 0.3947745247064525,
"grad_norm": 3.4767646455734447,
"learning_rate": 4.535636933227651e-06,
"logits/chosen": -1.2618310451507568,
"logits/rejected": -0.8865704536437988,
"logps/chosen": -136.52227783203125,
"logps/rejected": -366.13525390625,
"loss": 0.9855,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7819976806640625,
"rewards/margins": 9.078688621520996,
"rewards/rejected": -3.2966911792755127,
"step": 1120
},
{
"epoch": 0.40182406979049634,
"grad_norm": 3.660942268417341,
"learning_rate": 4.519447365047341e-06,
"logits/chosen": -1.2751779556274414,
"logits/rejected": -0.9009385108947754,
"logps/chosen": -135.4525909423828,
"logps/rejected": -369.0229187011719,
"loss": 0.9765,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.787187099456787,
"rewards/margins": 9.112687110900879,
"rewards/rejected": -3.3254997730255127,
"step": 1140
},
{
"epoch": 0.4088736148745401,
"grad_norm": 3.6419994005758127,
"learning_rate": 4.5030103379736335e-06,
"logits/chosen": -1.2887815237045288,
"logits/rejected": -0.9329848289489746,
"logps/chosen": -135.89544677734375,
"logps/rejected": -370.81414794921875,
"loss": 0.9764,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7939133644104,
"rewards/margins": 9.13733959197998,
"rewards/rejected": -3.3434274196624756,
"step": 1160
},
{
"epoch": 0.4159231599585839,
"grad_norm": 3.4187952865950453,
"learning_rate": 4.486327866165751e-06,
"logits/chosen": -1.2444560527801514,
"logits/rejected": -0.9014924168586731,
"logps/chosen": -136.8031768798828,
"logps/rejected": -373.68890380859375,
"loss": 0.983,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.815255641937256,
"rewards/margins": 9.18748664855957,
"rewards/rejected": -3.3722312450408936,
"step": 1180
},
{
"epoch": 0.42297270504262774,
"grad_norm": 3.5442759882748103,
"learning_rate": 4.469401993859201e-06,
"logits/chosen": -1.262442946434021,
"logits/rejected": -0.8692361116409302,
"logps/chosen": -132.19586181640625,
"logps/rejected": -375.8874206542969,
"loss": 0.9612,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.739358425140381,
"rewards/margins": 9.133481979370117,
"rewards/rejected": -3.394123077392578,
"step": 1200
},
{
"epoch": 0.4300222501266715,
"grad_norm": 3.917612027189787,
"learning_rate": 4.4522347951152876e-06,
"logits/chosen": -1.2577459812164307,
"logits/rejected": -0.8758344650268555,
"logps/chosen": -132.18719482421875,
"logps/rejected": -378.2060241699219,
"loss": 0.9635,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.746466636657715,
"rewards/margins": 9.163701057434082,
"rewards/rejected": -3.417234420776367,
"step": 1220
},
{
"epoch": 0.4370717952107153,
"grad_norm": 3.3976029680607365,
"learning_rate": 4.434828373566952e-06,
"logits/chosen": -1.2668578624725342,
"logits/rejected": -0.8984266519546509,
"logps/chosen": -134.82275390625,
"logps/rejected": -380.01171875,
"loss": 0.9756,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.738933086395264,
"rewards/margins": 9.174293518066406,
"rewards/rejected": -3.4353606700897217,
"step": 1240
},
{
"epoch": 0.4441213402947591,
"grad_norm": 3.4516443155163783,
"learning_rate": 4.417184862161005e-06,
"logits/chosen": -1.2303470373153687,
"logits/rejected": -0.872378945350647,
"logps/chosen": -141.74575805664062,
"logps/rejected": -381.9054870605469,
"loss": 1.007,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.898972988128662,
"rewards/margins": 9.353425979614258,
"rewards/rejected": -3.4544525146484375,
"step": 1260
},
{
"epoch": 0.4511708853788029,
"grad_norm": 3.4252391290463713,
"learning_rate": 4.399306422896755e-06,
"logits/chosen": -1.1987241506576538,
"logits/rejected": -0.8535988926887512,
"logps/chosen": -137.49851989746094,
"logps/rejected": -383.2096252441406,
"loss": 0.9777,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.86844539642334,
"rewards/margins": 9.335935592651367,
"rewards/rejected": -3.4674899578094482,
"step": 1280
},
{
"epoch": 0.4582204304628467,
"grad_norm": 3.545736837903789,
"learning_rate": 4.3811952465610835e-06,
"logits/chosen": -1.2155053615570068,
"logits/rejected": -0.899692714214325,
"logps/chosen": -129.8439178466797,
"logps/rejected": -383.0055847167969,
"loss": 0.9496,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7504801750183105,
"rewards/margins": 9.215784072875977,
"rewards/rejected": -3.465303421020508,
"step": 1300
},
{
"epoch": 0.4652699755468905,
"grad_norm": 3.447372993429395,
"learning_rate": 4.362853552459992e-06,
"logits/chosen": -1.2124435901641846,
"logits/rejected": -0.9287330508232117,
"logps/chosen": -130.0380096435547,
"logps/rejected": -381.68341064453125,
"loss": 0.9461,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.767488479614258,
"rewards/margins": 9.219490051269531,
"rewards/rejected": -3.4520013332366943,
"step": 1320
},
{
"epoch": 0.4723195206309343,
"grad_norm": 3.623096464653975,
"learning_rate": 4.344283588146648e-06,
"logits/chosen": -1.190288782119751,
"logits/rejected": -0.8485528826713562,
"logps/chosen": -130.83909606933594,
"logps/rejected": -384.2164001464844,
"loss": 0.9506,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.791656017303467,
"rewards/margins": 9.269119262695312,
"rewards/rejected": -3.477463483810425,
"step": 1340
},
{
"epoch": 0.47936906571497806,
"grad_norm": 3.560577638508316,
"learning_rate": 4.3254876291459805e-06,
"logits/chosen": -1.1886543035507202,
"logits/rejected": -0.8671566247940063,
"logps/chosen": -136.43968200683594,
"logps/rejected": -386.629638671875,
"loss": 0.9836,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.842299461364746,
"rewards/margins": 9.343871116638184,
"rewards/rejected": -3.5015721321105957,
"step": 1360
},
{
"epoch": 0.4864186107990219,
"grad_norm": 3.1473460328784997,
"learning_rate": 4.3064679786758364e-06,
"logits/chosen": -1.2196760177612305,
"logits/rejected": -0.8831031918525696,
"logps/chosen": -136.54266357421875,
"logps/rejected": -388.2689208984375,
"loss": 0.9732,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.864408016204834,
"rewards/margins": 9.382452964782715,
"rewards/rejected": -3.51804518699646,
"step": 1380
},
{
"epoch": 0.4934681558830657,
"grad_norm": 3.4991605727429156,
"learning_rate": 4.287226967364755e-06,
"logits/chosen": -1.1478255987167358,
"logits/rejected": -0.8476384282112122,
"logps/chosen": -135.9375457763672,
"logps/rejected": -390.1582946777344,
"loss": 0.9744,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.896569728851318,
"rewards/margins": 9.433442115783691,
"rewards/rejected": -3.536872625350952,
"step": 1400
},
{
"epoch": 0.5005177009671095,
"grad_norm": 3.307262546072772,
"learning_rate": 4.267766952966369e-06,
"logits/chosen": -1.1719753742218018,
"logits/rejected": -0.8473686575889587,
"logps/chosen": -132.99285888671875,
"logps/rejected": -390.5616149902344,
"loss": 0.9544,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.861688613891602,
"rewards/margins": 9.402670860290527,
"rewards/rejected": -3.5409820079803467,
"step": 1420
},
{
"epoch": 0.5075672460511532,
"grad_norm": 3.418974654141689,
"learning_rate": 4.248090320070501e-06,
"logits/chosen": -1.1494816541671753,
"logits/rejected": -0.8639947772026062,
"logps/chosen": -130.6112060546875,
"logps/rejected": -389.1405334472656,
"loss": 0.9413,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.852599620819092,
"rewards/margins": 9.379307746887207,
"rewards/rejected": -3.5267083644866943,
"step": 1440
},
{
"epoch": 0.5146167911351971,
"grad_norm": 3.641099772646816,
"learning_rate": 4.22819947981095e-06,
"logits/chosen": -1.2039188146591187,
"logits/rejected": -0.8875476717948914,
"logps/chosen": -133.9625701904297,
"logps/rejected": -394.1423034667969,
"loss": 0.9624,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.86251974105835,
"rewards/margins": 9.439249038696289,
"rewards/rejected": -3.5767295360565186,
"step": 1460
},
{
"epoch": 0.5216663362192409,
"grad_norm": 3.5377427783220856,
"learning_rate": 4.208096869570046e-06,
"logits/chosen": -1.1980522871017456,
"logits/rejected": -0.8648649454116821,
"logps/chosen": -133.25439453125,
"logps/rejected": -394.1308288574219,
"loss": 0.9571,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7785868644714355,
"rewards/margins": 9.355122566223145,
"rewards/rejected": -3.576535940170288,
"step": 1480
},
{
"epoch": 0.5287158813032846,
"grad_norm": 3.572038643146727,
"learning_rate": 4.1877849526799705e-06,
"logits/chosen": -1.1758784055709839,
"logits/rejected": -0.8390051126480103,
"logps/chosen": -130.81890869140625,
"logps/rejected": -396.5048522949219,
"loss": 0.9403,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8407206535339355,
"rewards/margins": 9.441041946411133,
"rewards/rejected": -3.60032057762146,
"step": 1500
},
{
"epoch": 0.5357654263873285,
"grad_norm": 3.4947337389011577,
"learning_rate": 4.167266218120907e-06,
"logits/chosen": -1.1654809713363647,
"logits/rejected": -0.8482118844985962,
"logps/chosen": -129.94468688964844,
"logps/rejected": -397.8246765136719,
"loss": 0.93,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.867563724517822,
"rewards/margins": 9.481162071228027,
"rewards/rejected": -3.613598585128784,
"step": 1520
},
{
"epoch": 0.5428149714713723,
"grad_norm": 3.42502470705207,
"learning_rate": 4.146543180216051e-06,
"logits/chosen": -1.1724631786346436,
"logits/rejected": -0.8869396448135376,
"logps/chosen": -132.4146270751953,
"logps/rejected": -398.17681884765625,
"loss": 0.954,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.82112979888916,
"rewards/margins": 9.438194274902344,
"rewards/rejected": -3.6170647144317627,
"step": 1540
},
{
"epoch": 0.549864516555416,
"grad_norm": 3.4338454588217493,
"learning_rate": 4.125618378323503e-06,
"logits/chosen": -1.2047984600067139,
"logits/rejected": -0.8852383494377136,
"logps/chosen": -125.2190933227539,
"logps/rejected": -398.3727722167969,
"loss": 0.9137,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7483320236206055,
"rewards/margins": 9.367304801940918,
"rewards/rejected": -3.6189732551574707,
"step": 1560
},
{
"epoch": 0.5569140616394598,
"grad_norm": 3.476811547379371,
"learning_rate": 4.104494376525106e-06,
"logits/chosen": -1.2003843784332275,
"logits/rejected": -0.8351160883903503,
"logps/chosen": -127.03767395019531,
"logps/rejected": -399.4728088378906,
"loss": 0.9273,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.7366766929626465,
"rewards/margins": 9.36662769317627,
"rewards/rejected": -3.6299514770507812,
"step": 1580
},
{
"epoch": 0.5639636067235037,
"grad_norm": 3.383144084845009,
"learning_rate": 4.083173763312248e-06,
"logits/chosen": -1.142123818397522,
"logits/rejected": -0.8299978375434875,
"logps/chosen": -132.69578552246094,
"logps/rejected": -402.67486572265625,
"loss": 0.9548,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.840543746948242,
"rewards/margins": 9.502639770507812,
"rewards/rejected": -3.662095785140991,
"step": 1600
},
{
"epoch": 0.5710131518075474,
"grad_norm": 3.4145198061570103,
"learning_rate": 4.061659151268668e-06,
"logits/chosen": -1.1268980503082275,
"logits/rejected": -0.8165463805198669,
"logps/chosen": -130.29315185546875,
"logps/rejected": -404.2532043457031,
"loss": 0.9449,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.816205024719238,
"rewards/margins": 9.493982315063477,
"rewards/rejected": -3.677777051925659,
"step": 1620
},
{
"epoch": 0.5780626968915912,
"grad_norm": 3.2632269319301845,
"learning_rate": 4.039953176750321e-06,
"logits/chosen": -1.123947024345398,
"logits/rejected": -0.7603979706764221,
"logps/chosen": -129.68569946289062,
"logps/rejected": -404.5145568847656,
"loss": 0.9394,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.798472881317139,
"rewards/margins": 9.478851318359375,
"rewards/rejected": -3.6803791522979736,
"step": 1640
},
{
"epoch": 0.585112241975635,
"grad_norm": 3.413795992881312,
"learning_rate": 4.018058499562326e-06,
"logits/chosen": -1.1000627279281616,
"logits/rejected": -0.7446752786636353,
"logps/chosen": -134.6430206298828,
"logps/rejected": -405.8460998535156,
"loss": 0.9556,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.897136211395264,
"rewards/margins": 9.590985298156738,
"rewards/rejected": -3.6938483715057373,
"step": 1660
},
{
"epoch": 0.5921617870596788,
"grad_norm": 3.362057771016843,
"learning_rate": 3.995977802633032e-06,
"logits/chosen": -1.0729659795761108,
"logits/rejected": -0.7292208075523376,
"logps/chosen": -131.8883056640625,
"logps/rejected": -407.1473083496094,
"loss": 0.9444,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.883275985717773,
"rewards/margins": 9.590205192565918,
"rewards/rejected": -3.706928253173828,
"step": 1680
},
{
"epoch": 0.5992113321437226,
"grad_norm": 3.428303427953224,
"learning_rate": 3.973713791685263e-06,
"logits/chosen": -1.0722216367721558,
"logits/rejected": -0.7673947215080261,
"logps/chosen": -132.4364471435547,
"logps/rejected": -407.9749755859375,
"loss": 0.9449,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.872178554534912,
"rewards/margins": 9.58730411529541,
"rewards/rejected": -3.7151248455047607,
"step": 1700
},
{
"epoch": 0.6062608772277664,
"grad_norm": 4.027806823030179,
"learning_rate": 3.951269194904765e-06,
"logits/chosen": -1.0447555780410767,
"logits/rejected": -0.7338883280754089,
"logps/chosen": -129.0581817626953,
"logps/rejected": -409.84765625,
"loss": 0.9372,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.825490951538086,
"rewards/margins": 9.559223175048828,
"rewards/rejected": -3.7337327003479004,
"step": 1720
},
{
"epoch": 0.6133104223118102,
"grad_norm": 3.5067474676414108,
"learning_rate": 3.928646762605892e-06,
"logits/chosen": -1.0310039520263672,
"logits/rejected": -0.7086827158927917,
"logps/chosen": -130.6362762451172,
"logps/rejected": -411.41943359375,
"loss": 0.9471,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.827083587646484,
"rewards/margins": 9.576539993286133,
"rewards/rejected": -3.7494568824768066,
"step": 1740
},
{
"epoch": 0.620359967395854,
"grad_norm": 3.4568451567973306,
"learning_rate": 3.9058492668945995e-06,
"logits/chosen": -1.0341156721115112,
"logits/rejected": -0.6810585856437683,
"logps/chosen": -132.6650848388672,
"logps/rejected": -412.41943359375,
"loss": 0.9477,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.893308162689209,
"rewards/margins": 9.652871131896973,
"rewards/rejected": -3.759563446044922,
"step": 1760
},
{
"epoch": 0.6274095124798977,
"grad_norm": 3.6505774836083535,
"learning_rate": 3.882879501328747e-06,
"logits/chosen": -1.0008310079574585,
"logits/rejected": -0.6594039797782898,
"logps/chosen": -129.55776977539062,
"logps/rejected": -414.1068420410156,
"loss": 0.9315,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.86118221282959,
"rewards/margins": 9.637504577636719,
"rewards/rejected": -3.776322603225708,
"step": 1780
},
{
"epoch": 0.6344590575639416,
"grad_norm": 3.5805353661928248,
"learning_rate": 3.859740280575787e-06,
"logits/chosen": -0.9931684732437134,
"logits/rejected": -0.6309129595756531,
"logps/chosen": -130.759033203125,
"logps/rejected": -415.9930725097656,
"loss": 0.9437,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.825643539428711,
"rewards/margins": 9.62082576751709,
"rewards/rejected": -3.7951819896698,
"step": 1800
},
{
"epoch": 0.6415086026479854,
"grad_norm": 3.4415482352225326,
"learning_rate": 3.836434440067858e-06,
"logits/chosen": -0.9874204993247986,
"logits/rejected": -0.6279218792915344,
"logps/chosen": -129.0675811767578,
"logps/rejected": -417.6011657714844,
"loss": 0.9333,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.867391109466553,
"rewards/margins": 9.678707122802734,
"rewards/rejected": -3.8113160133361816,
"step": 1820
},
{
"epoch": 0.6485581477320291,
"grad_norm": 3.4807858238324023,
"learning_rate": 3.8129648356543382e-06,
"logits/chosen": -1.0203678607940674,
"logits/rejected": -0.6446800827980042,
"logps/chosen": -127.60528564453125,
"logps/rejected": -418.719482421875,
"loss": 0.9229,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.817119121551514,
"rewards/margins": 9.639582633972168,
"rewards/rejected": -3.8224639892578125,
"step": 1840
},
{
"epoch": 0.655607692816073,
"grad_norm": 3.3116942159563876,
"learning_rate": 3.789334343251895e-06,
"logits/chosen": -1.0190130472183228,
"logits/rejected": -0.6429179906845093,
"logps/chosen": -128.70944213867188,
"logps/rejected": -420.0308532714844,
"loss": 0.9263,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.867619037628174,
"rewards/margins": 9.703242301940918,
"rewards/rejected": -3.8356235027313232,
"step": 1860
},
{
"epoch": 0.6626572379001168,
"grad_norm": 3.4172689324940664,
"learning_rate": 3.765545858492077e-06,
"logits/chosen": -0.9674955606460571,
"logits/rejected": -0.6231255531311035,
"logps/chosen": -128.2279052734375,
"logps/rejected": -420.6991271972656,
"loss": 0.9211,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.826233863830566,
"rewards/margins": 9.668456077575684,
"rewards/rejected": -3.8422226905822754,
"step": 1880
},
{
"epoch": 0.6697067829841605,
"grad_norm": 3.62022832891231,
"learning_rate": 3.741602296366487e-06,
"logits/chosen": -0.9731711745262146,
"logits/rejected": -0.622948169708252,
"logps/chosen": -127.91731262207031,
"logps/rejected": -420.6474609375,
"loss": 0.9201,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.898708820343018,
"rewards/margins": 9.740521430969238,
"rewards/rejected": -3.8418128490448,
"step": 1900
},
{
"epoch": 0.6767563280682043,
"grad_norm": 3.75244615174725,
"learning_rate": 3.7175065908695907e-06,
"logits/chosen": -0.9623433351516724,
"logits/rejected": -0.611689031124115,
"logps/chosen": -127.69451904296875,
"logps/rejected": -421.1045837402344,
"loss": 0.9245,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.853524684906006,
"rewards/margins": 9.699825286865234,
"rewards/rejected": -3.846301317214966,
"step": 1920
},
{
"epoch": 0.6838058731522482,
"grad_norm": 3.4336011780072306,
"learning_rate": 3.6932616946391825e-06,
"logits/chosen": -0.9564552307128906,
"logits/rejected": -0.5640252232551575,
"logps/chosen": -128.86248779296875,
"logps/rejected": -422.3670349121094,
"loss": 0.9265,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.866178035736084,
"rewards/margins": 9.725229263305664,
"rewards/rejected": -3.859050750732422,
"step": 1940
},
{
"epoch": 0.6908554182362919,
"grad_norm": 3.346062760434833,
"learning_rate": 3.6688705785945828e-06,
"logits/chosen": -0.9649378657341003,
"logits/rejected": -0.5636154413223267,
"logps/chosen": -128.705322265625,
"logps/rejected": -422.5312194824219,
"loss": 0.9322,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.877577304840088,
"rewards/margins": 9.738177299499512,
"rewards/rejected": -3.8605995178222656,
"step": 1960
},
{
"epoch": 0.6979049633203357,
"grad_norm": 3.562698291523997,
"learning_rate": 3.644336231572584e-06,
"logits/chosen": -1.0158581733703613,
"logits/rejected": -0.5938189625740051,
"logps/chosen": -127.7231674194336,
"logps/rejected": -423.938232421875,
"loss": 0.9313,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.831978797912598,
"rewards/margins": 9.706572532653809,
"rewards/rejected": -3.874593734741211,
"step": 1980
},
{
"epoch": 0.7049545084043796,
"grad_norm": 3.871831303710938,
"learning_rate": 3.6196616599612043e-06,
"logits/chosen": -0.9945854544639587,
"logits/rejected": -0.5935266613960266,
"logps/chosen": -131.4221649169922,
"logps/rejected": -425.1352233886719,
"loss": 0.9457,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.880837917327881,
"rewards/margins": 9.767516136169434,
"rewards/rejected": -3.8866775035858154,
"step": 2000
},
{
"epoch": 0.7120040534884233,
"grad_norm": 3.525675978075724,
"learning_rate": 3.5948498873312963e-06,
"logits/chosen": -0.9815523028373718,
"logits/rejected": -0.6091595888137817,
"logps/chosen": -125.77652740478516,
"logps/rejected": -425.8006896972656,
"loss": 0.914,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.811987400054932,
"rewards/margins": 9.705245971679688,
"rewards/rejected": -3.893258810043335,
"step": 2020
},
{
"epoch": 0.7190535985724671,
"grad_norm": 3.44484661738424,
"learning_rate": 3.5699039540660364e-06,
"logits/chosen": -0.9921701550483704,
"logits/rejected": -0.6261746883392334,
"logps/chosen": -127.05594635009766,
"logps/rejected": -426.45538330078125,
"loss": 0.9157,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.895692348480225,
"rewards/margins": 9.795554161071777,
"rewards/rejected": -3.8998615741729736,
"step": 2040
},
{
"epoch": 0.726103143656511,
"grad_norm": 3.4745813514606203,
"learning_rate": 3.5448269169883686e-06,
"logits/chosen": -0.9629077911376953,
"logits/rejected": -0.562868058681488,
"logps/chosen": -125.16448211669922,
"logps/rejected": -427.8833923339844,
"loss": 0.896,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.892127990722656,
"rewards/margins": 9.806340217590332,
"rewards/rejected": -3.9142112731933594,
"step": 2060
},
{
"epoch": 0.7331526887405547,
"grad_norm": 3.424852141033239,
"learning_rate": 3.519621848986428e-06,
"logits/chosen": -0.9507054686546326,
"logits/rejected": -0.5376430749893188,
"logps/chosen": -128.2971954345703,
"logps/rejected": -428.9288024902344,
"loss": 0.9258,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.92517614364624,
"rewards/margins": 9.849757194519043,
"rewards/rejected": -3.9245803356170654,
"step": 2080
},
{
"epoch": 0.7402022338245985,
"grad_norm": 3.5251677265369543,
"learning_rate": 3.4942918386369916e-06,
"logits/chosen": -0.9696516394615173,
"logits/rejected": -0.5560165643692017,
"logps/chosen": -128.28604125976562,
"logps/rejected": -429.8753967285156,
"loss": 0.923,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.882317543029785,
"rewards/margins": 9.816309928894043,
"rewards/rejected": -3.9339921474456787,
"step": 2100
},
{
"epoch": 0.7472517789086423,
"grad_norm": 3.4170327558831333,
"learning_rate": 3.468839989827014e-06,
"logits/chosen": -0.9457041025161743,
"logits/rejected": -0.5628274083137512,
"logps/chosen": -127.13785552978516,
"logps/rejected": -429.71923828125,
"loss": 0.9242,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.836861610412598,
"rewards/margins": 9.769371032714844,
"rewards/rejected": -3.932508945465088,
"step": 2120
},
{
"epoch": 0.7543013239926861,
"grad_norm": 3.6140577579219695,
"learning_rate": 3.443269421373282e-06,
"logits/chosen": -0.9552961587905884,
"logits/rejected": -0.5585734844207764,
"logps/chosen": -125.71052551269531,
"logps/rejected": -431.2530822753906,
"loss": 0.9171,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.742475509643555,
"rewards/margins": 9.690230369567871,
"rewards/rejected": -3.9477555751800537,
"step": 2140
},
{
"epoch": 0.7613508690767299,
"grad_norm": 3.3716609559874087,
"learning_rate": 3.41758326664024e-06,
"logits/chosen": -0.9388333559036255,
"logits/rejected": -0.5697802901268005,
"logps/chosen": -125.131103515625,
"logps/rejected": -432.1859436035156,
"loss": 0.9039,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.860114097595215,
"rewards/margins": 9.817200660705566,
"rewards/rejected": -3.9570865631103516,
"step": 2160
},
{
"epoch": 0.7684004141607736,
"grad_norm": 3.647585572092197,
"learning_rate": 3.391784673156038e-06,
"logits/chosen": -0.9397434592247009,
"logits/rejected": -0.5741142630577087,
"logps/chosen": -129.66160583496094,
"logps/rejected": -433.1015319824219,
"loss": 0.924,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.916826248168945,
"rewards/margins": 9.883145332336426,
"rewards/rejected": -3.966318130493164,
"step": 2180
},
{
"epoch": 0.7754499592448175,
"grad_norm": 3.2724520562704438,
"learning_rate": 3.365876802226833e-06,
"logits/chosen": -0.9321144223213196,
"logits/rejected": -0.5627609491348267,
"logps/chosen": -124.29692840576172,
"logps/rejected": -433.846923828125,
"loss": 0.8999,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.887063026428223,
"rewards/margins": 9.86075496673584,
"rewards/rejected": -3.9736931324005127,
"step": 2200
},
{
"epoch": 0.7824995043288613,
"grad_norm": 3.458299468379887,
"learning_rate": 3.3398628285494193e-06,
"logits/chosen": -0.9217039346694946,
"logits/rejected": -0.5298233032226562,
"logps/chosen": -126.9094009399414,
"logps/rejected": -434.9250183105469,
"loss": 0.9202,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.8385796546936035,
"rewards/margins": 9.823028564453125,
"rewards/rejected": -3.9844493865966797,
"step": 2220
},
{
"epoch": 0.789549049412905,
"grad_norm": 3.5226619493219804,
"learning_rate": 3.3137459398221984e-06,
"logits/chosen": -0.8872036337852478,
"logits/rejected": -0.498056560754776,
"logps/chosen": -127.6511459350586,
"logps/rejected": -435.1647033691406,
"loss": 0.9222,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.896953105926514,
"rewards/margins": 9.883868217468262,
"rewards/rejected": -3.9869160652160645,
"step": 2240
},
{
"epoch": 0.7965985944969489,
"grad_norm": 3.5412559165567745,
"learning_rate": 3.2875293363545692e-06,
"logits/chosen": -0.876203715801239,
"logits/rejected": -0.49807339906692505,
"logps/chosen": -128.12411499023438,
"logps/rejected": -435.5065002441406,
"loss": 0.9134,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.977075576782227,
"rewards/margins": 9.967486381530762,
"rewards/rejected": -3.9904098510742188,
"step": 2260
},
{
"epoch": 0.8036481395809927,
"grad_norm": 3.620961758496946,
"learning_rate": 3.261216230674768e-06,
"logits/chosen": -0.8721205592155457,
"logits/rejected": -0.46372947096824646,
"logps/chosen": -127.4286880493164,
"logps/rejected": -436.84674072265625,
"loss": 0.9183,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.863312244415283,
"rewards/margins": 9.866989135742188,
"rewards/rejected": -4.0036773681640625,
"step": 2280
},
{
"epoch": 0.8106976846650364,
"grad_norm": 3.4592850588257718,
"learning_rate": 3.2348098471362132e-06,
"logits/chosen": -0.8742119073867798,
"logits/rejected": -0.4837910234928131,
"logps/chosen": -128.3339080810547,
"logps/rejected": -437.7392578125,
"loss": 0.9189,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9210028648376465,
"rewards/margins": 9.933659553527832,
"rewards/rejected": -4.0126566886901855,
"step": 2300
},
{
"epoch": 0.8177472297490802,
"grad_norm": 3.5572863657837153,
"learning_rate": 3.208313421522397e-06,
"logits/chosen": -0.8337503671646118,
"logits/rejected": -0.47826629877090454,
"logps/chosen": -127.3692398071289,
"logps/rejected": -437.60162353515625,
"loss": 0.9134,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.904942512512207,
"rewards/margins": 9.916193962097168,
"rewards/rejected": -4.011251449584961,
"step": 2320
},
{
"epoch": 0.8247967748331241,
"grad_norm": 3.8162541172864333,
"learning_rate": 3.1817302006503835e-06,
"logits/chosen": -0.8695448040962219,
"logits/rejected": -0.5193209052085876,
"logps/chosen": -127.39215087890625,
"logps/rejected": -438.3293151855469,
"loss": 0.9169,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.902700901031494,
"rewards/margins": 9.921298027038574,
"rewards/rejected": -4.018597602844238,
"step": 2340
},
{
"epoch": 0.8318463199171678,
"grad_norm": 3.339934252923497,
"learning_rate": 3.1550634419729443e-06,
"logits/chosen": -0.9015306830406189,
"logits/rejected": -0.533819854259491,
"logps/chosen": -127.2344970703125,
"logps/rejected": -438.45684814453125,
"loss": 0.9114,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.944534778594971,
"rewards/margins": 9.964533805847168,
"rewards/rejected": -4.019999980926514,
"step": 2360
},
{
"epoch": 0.8388958650012116,
"grad_norm": 3.271467539577929,
"learning_rate": 3.128316413179403e-06,
"logits/chosen": -0.8996972441673279,
"logits/rejected": -0.48372983932495117,
"logps/chosen": -132.72434997558594,
"logps/rejected": -438.97900390625,
"loss": 0.9392,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.998867988586426,
"rewards/margins": 10.023943901062012,
"rewards/rejected": -4.025076389312744,
"step": 2380
},
{
"epoch": 0.8459454100852555,
"grad_norm": 3.347650161642198,
"learning_rate": 3.101492391795215e-06,
"logits/chosen": -0.923694908618927,
"logits/rejected": -0.521001935005188,
"logps/chosen": -124.9764633178711,
"logps/rejected": -439.68194580078125,
"loss": 0.9067,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.902924537658691,
"rewards/margins": 9.934977531433105,
"rewards/rejected": -4.032052516937256,
"step": 2400
},
{
"epoch": 0.8529949551692992,
"grad_norm": 3.5949752114784594,
"learning_rate": 3.0745946647803506e-06,
"logits/chosen": -0.8853136301040649,
"logits/rejected": -0.5156723260879517,
"logps/chosen": -127.0760726928711,
"logps/rejected": -440.3129577636719,
"loss": 0.9012,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.988919258117676,
"rewards/margins": 10.027365684509277,
"rewards/rejected": -4.038445949554443,
"step": 2420
},
{
"epoch": 0.860044500253343,
"grad_norm": 3.4036399165146873,
"learning_rate": 3.0476265281265136e-06,
"logits/chosen": -0.8544861078262329,
"logits/rejected": -0.5153040885925293,
"logps/chosen": -126.94913482666016,
"logps/rejected": -440.95733642578125,
"loss": 0.9124,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.942080020904541,
"rewards/margins": 9.986968040466309,
"rewards/rejected": -4.044888496398926,
"step": 2440
},
{
"epoch": 0.8670940453373869,
"grad_norm": 3.441908586206191,
"learning_rate": 3.0205912864532582e-06,
"logits/chosen": -0.8383521437644958,
"logits/rejected": -0.4816366732120514,
"logps/chosen": -118.70378875732422,
"logps/rejected": -442.1132507324219,
"loss": 0.8588,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.885745048522949,
"rewards/margins": 9.942169189453125,
"rewards/rejected": -4.056425094604492,
"step": 2460
},
{
"epoch": 0.8741435904214306,
"grad_norm": 3.4985409509349084,
"learning_rate": 2.9934922526030507e-06,
"logits/chosen": -0.8542296290397644,
"logits/rejected": -0.4874165654182434,
"logps/chosen": -126.6626205444336,
"logps/rejected": -442.2069396972656,
"loss": 0.9089,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.916454315185547,
"rewards/margins": 9.973841667175293,
"rewards/rejected": -4.057387351989746,
"step": 2480
},
{
"epoch": 0.8811931355054744,
"grad_norm": 3.498245870729679,
"learning_rate": 2.966332747235318e-06,
"logits/chosen": -0.8780538439750671,
"logits/rejected": -0.4646807610988617,
"logps/chosen": -128.723876953125,
"logps/rejected": -442.9123229980469,
"loss": 0.9113,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.958227634429932,
"rewards/margins": 10.022747993469238,
"rewards/rejected": -4.064519882202148,
"step": 2500
},
{
"epoch": 0.8882426805895182,
"grad_norm": 3.591985658151727,
"learning_rate": 2.939116098419538e-06,
"logits/chosen": -0.8663986325263977,
"logits/rejected": -0.4639105796813965,
"logps/chosen": -128.5757293701172,
"logps/rejected": -443.9267578125,
"loss": 0.9148,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.976249694824219,
"rewards/margins": 10.05082893371582,
"rewards/rejected": -4.074578762054443,
"step": 2520
},
{
"epoch": 0.895292225673562,
"grad_norm": 3.722094728579672,
"learning_rate": 2.9118456412274348e-06,
"logits/chosen": -0.8501833081245422,
"logits/rejected": -0.45762744545936584,
"logps/chosen": -125.53106689453125,
"logps/rejected": -444.51739501953125,
"loss": 0.9058,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.896867275238037,
"rewards/margins": 9.977302551269531,
"rewards/rejected": -4.080434322357178,
"step": 2540
},
{
"epoch": 0.9023417707576058,
"grad_norm": 3.2456180119701377,
"learning_rate": 2.8845247173242923e-06,
"logits/chosen": -0.8174735903739929,
"logits/rejected": -0.44560080766677856,
"logps/chosen": -123.5894546508789,
"logps/rejected": -445.1806640625,
"loss": 0.8893,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.879065990447998,
"rewards/margins": 9.966200828552246,
"rewards/rejected": -4.087134838104248,
"step": 2560
},
{
"epoch": 0.9093913158416496,
"grad_norm": 3.5653205016331357,
"learning_rate": 2.8571566745594857e-06,
"logits/chosen": -0.8129379153251648,
"logits/rejected": -0.45377278327941895,
"logps/chosen": -124.4063949584961,
"logps/rejected": -445.7228698730469,
"loss": 0.8938,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.926529407501221,
"rewards/margins": 10.0189790725708,
"rewards/rejected": -4.0924506187438965,
"step": 2580
},
{
"epoch": 0.9164408609256934,
"grad_norm": 3.5526191928837303,
"learning_rate": 2.829744866556236e-06,
"logits/chosen": -0.7916550636291504,
"logits/rejected": -0.40035343170166016,
"logps/chosen": -124.70146942138672,
"logps/rejected": -446.320068359375,
"loss": 0.8904,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9350385665893555,
"rewards/margins": 10.033607482910156,
"rewards/rejected": -4.098568916320801,
"step": 2600
},
{
"epoch": 0.9234904060097372,
"grad_norm": 3.434227344310014,
"learning_rate": 2.8022926523006644e-06,
"logits/chosen": -0.8052730560302734,
"logits/rejected": -0.4015885293483734,
"logps/chosen": -124.60784912109375,
"logps/rejected": -446.3960876464844,
"loss": 0.8916,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.919919967651367,
"rewards/margins": 10.019259452819824,
"rewards/rejected": -4.099339008331299,
"step": 2620
},
{
"epoch": 0.930539951093781,
"grad_norm": 3.3926060029845986,
"learning_rate": 2.774803395730194e-06,
"logits/chosen": -0.8420242667198181,
"logits/rejected": -0.41102123260498047,
"logps/chosen": -126.0671615600586,
"logps/rejected": -446.8029479980469,
"loss": 0.9077,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.900039196014404,
"rewards/margins": 10.003410339355469,
"rewards/rejected": -4.103371620178223,
"step": 2640
},
{
"epoch": 0.9375894961778247,
"grad_norm": 3.357665650193962,
"learning_rate": 2.747280465321332e-06,
"logits/chosen": -0.8105039000511169,
"logits/rejected": -0.3818654417991638,
"logps/chosen": -126.90190887451172,
"logps/rejected": -447.68603515625,
"loss": 0.9119,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.911780834197998,
"rewards/margins": 10.023977279663086,
"rewards/rejected": -4.1121954917907715,
"step": 2660
},
{
"epoch": 0.9446390412618686,
"grad_norm": 3.32023322129936,
"learning_rate": 2.7197272336769114e-06,
"logits/chosen": -0.8306812644004822,
"logits/rejected": -0.38350504636764526,
"logps/chosen": -123.29646301269531,
"logps/rejected": -448.5968933105469,
"loss": 0.8882,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.89159631729126,
"rewards/margins": 10.012816429138184,
"rewards/rejected": -4.121219635009766,
"step": 2680
},
{
"epoch": 0.9516885863459124,
"grad_norm": 3.5798392363898803,
"learning_rate": 2.692147077112815e-06,
"logits/chosen": -0.8416573405265808,
"logits/rejected": -0.40649136900901794,
"logps/chosen": -128.4845428466797,
"logps/rejected": -448.6703186035156,
"loss": 0.9173,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.955367565155029,
"rewards/margins": 10.07739543914795,
"rewards/rejected": -4.122028350830078,
"step": 2700
},
{
"epoch": 0.9587381314299561,
"grad_norm": 3.5837421382411807,
"learning_rate": 2.6645433752442474e-06,
"logits/chosen": -0.8264873623847961,
"logits/rejected": -0.39921826124191284,
"logps/chosen": -123.0130386352539,
"logps/rejected": -449.3578796386719,
"loss": 0.8858,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.90243673324585,
"rewards/margins": 10.031229972839355,
"rewards/rejected": -4.128793239593506,
"step": 2720
},
{
"epoch": 0.965787676514,
"grad_norm": 3.436374551260621,
"learning_rate": 2.6369195105716087e-06,
"logits/chosen": -0.8044994473457336,
"logits/rejected": -0.3846818506717682,
"logps/chosen": -124.25041961669922,
"logps/rejected": -449.81298828125,
"loss": 0.8951,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.933631420135498,
"rewards/margins": 10.06702995300293,
"rewards/rejected": -4.133397579193115,
"step": 2740
},
{
"epoch": 0.9728372215980438,
"grad_norm": 3.3164716286577227,
"learning_rate": 2.609278868066007e-06,
"logits/chosen": -0.8319023251533508,
"logits/rejected": -0.4071156978607178,
"logps/chosen": -125.79058074951172,
"logps/rejected": -449.4647521972656,
"loss": 0.8959,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.004671573638916,
"rewards/margins": 10.134662628173828,
"rewards/rejected": -4.129991054534912,
"step": 2760
},
{
"epoch": 0.9798867666820875,
"grad_norm": 3.4420462182992106,
"learning_rate": 2.581624834754469e-06,
"logits/chosen": -0.7811191082000732,
"logits/rejected": -0.35583096742630005,
"logps/chosen": -125.7827377319336,
"logps/rejected": -448.8321228027344,
"loss": 0.9017,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.957380771636963,
"rewards/margins": 10.080957412719727,
"rewards/rejected": -4.123576641082764,
"step": 2780
},
{
"epoch": 0.9869363117661314,
"grad_norm": 3.4368002175698105,
"learning_rate": 2.553960799304905e-06,
"logits/chosen": -0.7721039056777954,
"logits/rejected": -0.35003602504730225,
"logps/chosen": -126.46416473388672,
"logps/rejected": -450.1924743652344,
"loss": 0.9023,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.950610637664795,
"rewards/margins": 10.08786392211914,
"rewards/rejected": -4.137254238128662,
"step": 2800
},
{
"epoch": 0.9939858568501752,
"grad_norm": 3.3094259982405445,
"learning_rate": 2.526290151610865e-06,
"logits/chosen": -0.7781057357788086,
"logits/rejected": -0.3507390320301056,
"logps/chosen": -123.1922836303711,
"logps/rejected": -450.4504089355469,
"loss": 0.8887,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.893508434295654,
"rewards/margins": 10.033327102661133,
"rewards/rejected": -4.13981819152832,
"step": 2820
},
{
"epoch": 1.0007049545084044,
"grad_norm": 3.3806657760520955,
"learning_rate": 2.4986162823761513e-06,
"logits/chosen": -0.8073826432228088,
"logits/rejected": -0.3557504415512085,
"logps/chosen": -121.75922393798828,
"logps/rejected": -450.6792907714844,
"loss": 0.8728,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.962116718292236,
"rewards/margins": 10.104215621948242,
"rewards/rejected": -4.1421003341674805,
"step": 2840
},
{
"epoch": 1.0077544995924481,
"grad_norm": 3.3516929443658294,
"learning_rate": 2.4709425826993274e-06,
"logits/chosen": -0.8178227543830872,
"logits/rejected": -0.3367989957332611,
"logps/chosen": -112.90401458740234,
"logps/rejected": -450.7651062011719,
"loss": 0.8123,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.030337810516357,
"rewards/margins": 10.173301696777344,
"rewards/rejected": -4.1429643630981445,
"step": 2860
},
{
"epoch": 1.0148040446764919,
"grad_norm": 3.4921890072506168,
"learning_rate": 2.443272443658177e-06,
"logits/chosen": -0.8200713992118835,
"logits/rejected": -0.3234314024448395,
"logps/chosen": -112.2830810546875,
"logps/rejected": -451.45068359375,
"loss": 0.8067,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.004191875457764,
"rewards/margins": 10.153976440429688,
"rewards/rejected": -4.149785041809082,
"step": 2880
},
{
"epoch": 1.0218535897605359,
"grad_norm": 3.589176883533258,
"learning_rate": 2.415609255894173e-06,
"logits/chosen": -0.8283817172050476,
"logits/rejected": -0.35207217931747437,
"logps/chosen": -116.6107177734375,
"logps/rejected": -452.73248291015625,
"loss": 0.8273,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.116900444030762,
"rewards/margins": 10.279582023620605,
"rewards/rejected": -4.16268253326416,
"step": 2900
},
{
"epoch": 1.0289031348445796,
"grad_norm": 3.3300561164770324,
"learning_rate": 2.3879564091969936e-06,
"logits/chosen": -0.7885460257530212,
"logits/rejected": -0.3300701081752777,
"logps/chosen": -111.32238006591797,
"logps/rejected": -452.65673828125,
"loss": 0.7952,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.027840614318848,
"rewards/margins": 10.189643859863281,
"rewards/rejected": -4.16180419921875,
"step": 2920
},
{
"epoch": 1.0359526799286234,
"grad_norm": 3.5974185568251706,
"learning_rate": 2.360317292089142e-06,
"logits/chosen": -0.7770583033561707,
"logits/rejected": -0.30780109763145447,
"logps/chosen": -113.72601318359375,
"logps/rejected": -452.1132507324219,
"loss": 0.815,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.973827362060547,
"rewards/margins": 10.13033390045166,
"rewards/rejected": -4.156505584716797,
"step": 2940
},
{
"epoch": 1.0430022250126672,
"grad_norm": 3.94979541076315,
"learning_rate": 2.3326952914107272e-06,
"logits/chosen": -0.7770149111747742,
"logits/rejected": -0.32130613923072815,
"logps/chosen": -109.78759765625,
"logps/rejected": -452.708984375,
"loss": 0.8016,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9994988441467285,
"rewards/margins": 10.161717414855957,
"rewards/rejected": -4.162219047546387,
"step": 2960
},
{
"epoch": 1.050051770096711,
"grad_norm": 3.416366688616228,
"learning_rate": 2.3050937919044476e-06,
"logits/chosen": -0.7574631571769714,
"logits/rejected": -0.3230978548526764,
"logps/chosen": -112.95440673828125,
"logps/rejected": -453.5429382324219,
"loss": 0.8057,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.04919958114624,
"rewards/margins": 10.219966888427734,
"rewards/rejected": -4.170767307281494,
"step": 2980
},
{
"epoch": 1.0571013151807547,
"grad_norm": 3.5726005824550033,
"learning_rate": 2.277516175800829e-06,
"logits/chosen": -0.7706856727600098,
"logits/rejected": -0.3133412003517151,
"logps/chosen": -108.99263000488281,
"logps/rejected": -454.4375,
"loss": 0.7875,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.972984790802002,
"rewards/margins": 10.152580261230469,
"rewards/rejected": -4.179595470428467,
"step": 3000
},
{
"epoch": 1.0641508602647984,
"grad_norm": 3.644467271254268,
"learning_rate": 2.249965822403773e-06,
"logits/chosen": -0.785606324672699,
"logits/rejected": -0.30744919180870056,
"logps/chosen": -110.87646484375,
"logps/rejected": -454.81695556640625,
"loss": 0.8022,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.023101806640625,
"rewards/margins": 10.206517219543457,
"rewards/rejected": -4.183415412902832,
"step": 3020
},
{
"epoch": 1.0712004053488424,
"grad_norm": 3.5762988132387528,
"learning_rate": 2.2224461076764703e-06,
"logits/chosen": -0.7679362893104553,
"logits/rejected": -0.2972542941570282,
"logps/chosen": -112.6702880859375,
"logps/rejected": -454.98028564453125,
"loss": 0.8051,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.082765102386475,
"rewards/margins": 10.267889976501465,
"rewards/rejected": -4.18512487411499,
"step": 3040
},
{
"epoch": 1.0782499504328862,
"grad_norm": 3.765453501473496,
"learning_rate": 2.1949604038277085e-06,
"logits/chosen": -0.7607068419456482,
"logits/rejected": -0.29195815324783325,
"logps/chosen": -108.87700653076172,
"logps/rejected": -455.62042236328125,
"loss": 0.7928,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.96248197555542,
"rewards/margins": 10.153925895690918,
"rewards/rejected": -4.191443920135498,
"step": 3060
},
{
"epoch": 1.08529949551693,
"grad_norm": 3.675075817780507,
"learning_rate": 2.1675120788986524e-06,
"logits/chosen": -0.7517310380935669,
"logits/rejected": -0.2829591631889343,
"logps/chosen": -113.5805435180664,
"logps/rejected": -456.20068359375,
"loss": 0.8091,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.096705913543701,
"rewards/margins": 10.29398250579834,
"rewards/rejected": -4.197276592254639,
"step": 3080
},
{
"epoch": 1.0923490406009737,
"grad_norm": 3.510939604019646,
"learning_rate": 2.1401044963501353e-06,
"logits/chosen": -0.7697411775588989,
"logits/rejected": -0.28985849022865295,
"logps/chosen": -109.6702651977539,
"logps/rejected": -456.3376159667969,
"loss": 0.7883,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.059773921966553,
"rewards/margins": 10.258381843566895,
"rewards/rejected": -4.1986083984375,
"step": 3100
},
{
"epoch": 1.0993985856850175,
"grad_norm": 3.78162571016694,
"learning_rate": 2.1127410146505006e-06,
"logits/chosen": -0.7736045122146606,
"logits/rejected": -0.2909747064113617,
"logps/chosen": -111.208251953125,
"logps/rejected": -456.29913330078125,
"loss": 0.8031,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.034745216369629,
"rewards/margins": 10.23304271697998,
"rewards/rejected": -4.198297023773193,
"step": 3120
},
{
"epoch": 1.1064481307690612,
"grad_norm": 3.446885891663511,
"learning_rate": 2.0854249868640653e-06,
"logits/chosen": -0.7775657176971436,
"logits/rejected": -0.28920355439186096,
"logps/chosen": -111.19278717041016,
"logps/rejected": -456.51739501953125,
"loss": 0.8033,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.995384216308594,
"rewards/margins": 10.195777893066406,
"rewards/rejected": -4.2003936767578125,
"step": 3140
},
{
"epoch": 1.1134976758531052,
"grad_norm": 3.756805627719574,
"learning_rate": 2.0581597602402425e-06,
"logits/chosen": -0.7638501524925232,
"logits/rejected": -0.2594032287597656,
"logps/chosen": -115.24254608154297,
"logps/rejected": -456.53448486328125,
"loss": 0.8195,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.091201305389404,
"rewards/margins": 10.291844367980957,
"rewards/rejected": -4.200643062591553,
"step": 3160
},
{
"epoch": 1.120547220937149,
"grad_norm": 3.812655052084755,
"learning_rate": 2.0309486758033777e-06,
"logits/chosen": -0.7667771577835083,
"logits/rejected": -0.23339663445949554,
"logps/chosen": -112.5297622680664,
"logps/rejected": -456.4374084472656,
"loss": 0.8099,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.06128454208374,
"rewards/margins": 10.260945320129395,
"rewards/rejected": -4.199659824371338,
"step": 3180
},
{
"epoch": 1.1275967660211927,
"grad_norm": 3.9420982726472773,
"learning_rate": 2.0037950679433425e-06,
"logits/chosen": -0.7690004706382751,
"logits/rejected": -0.2527337074279785,
"logps/chosen": -113.76244354248047,
"logps/rejected": -456.77178955078125,
"loss": 0.8198,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.039876461029053,
"rewards/margins": 10.242877006530762,
"rewards/rejected": -4.203000545501709,
"step": 3200
},
{
"epoch": 1.1346463111052365,
"grad_norm": 3.335775619061638,
"learning_rate": 1.9767022640069493e-06,
"logits/chosen": -0.7629236578941345,
"logits/rejected": -0.24979268014431,
"logps/chosen": -111.00728607177734,
"logps/rejected": -457.5516052246094,
"loss": 0.7979,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.058557510375977,
"rewards/margins": 10.269355773925781,
"rewards/rejected": -4.210798740386963,
"step": 3220
},
{
"epoch": 1.1416958561892803,
"grad_norm": 3.5494177496277683,
"learning_rate": 1.9496735838902254e-06,
"logits/chosen": -0.7654642462730408,
"logits/rejected": -0.24192610383033752,
"logps/chosen": -112.31331634521484,
"logps/rejected": -457.4699401855469,
"loss": 0.8023,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.082005500793457,
"rewards/margins": 10.292023658752441,
"rewards/rejected": -4.210019111633301,
"step": 3240
},
{
"epoch": 1.148745401273324,
"grad_norm": 3.62036409136568,
"learning_rate": 1.922712339631595e-06,
"logits/chosen": -0.7688376903533936,
"logits/rejected": -0.2729661762714386,
"logps/chosen": -111.631591796875,
"logps/rejected": -457.541015625,
"loss": 0.8005,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.04392671585083,
"rewards/margins": 10.254591941833496,
"rewards/rejected": -4.210665702819824,
"step": 3260
},
{
"epoch": 1.1557949463573678,
"grad_norm": 3.8845526177462775,
"learning_rate": 1.895821835006033e-06,
"logits/chosen": -0.7550647854804993,
"logits/rejected": -0.26288357377052307,
"logps/chosen": -112.28546142578125,
"logps/rejected": -458.4576721191406,
"loss": 0.8046,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.07279634475708,
"rewards/margins": 10.29267406463623,
"rewards/rejected": -4.21987771987915,
"step": 3280
},
{
"epoch": 1.1628444914414118,
"grad_norm": 3.8168863101028276,
"learning_rate": 1.8690053651202278e-06,
"logits/chosen": -0.757368266582489,
"logits/rejected": -0.24931330978870392,
"logps/chosen": -110.4576187133789,
"logps/rejected": -458.091064453125,
"loss": 0.8051,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.992654323577881,
"rewards/margins": 10.208785057067871,
"rewards/rejected": -4.216129302978516,
"step": 3300
},
{
"epoch": 1.1698940365254555,
"grad_norm": 3.8170872257133874,
"learning_rate": 1.842266216008804e-06,
"logits/chosen": -0.7819269299507141,
"logits/rejected": -0.2846717834472656,
"logps/chosen": -110.04841613769531,
"logps/rejected": -458.6672058105469,
"loss": 0.8002,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.956142425537109,
"rewards/margins": 10.178072929382324,
"rewards/rejected": -4.221930980682373,
"step": 3320
},
{
"epoch": 1.1769435816094993,
"grad_norm": 3.623556472869695,
"learning_rate": 1.8156076642316566e-06,
"logits/chosen": -0.7666402459144592,
"logits/rejected": -0.27397018671035767,
"logps/chosen": -116.65936279296875,
"logps/rejected": -459.2618103027344,
"loss": 0.826,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.08809757232666,
"rewards/margins": 10.316171646118164,
"rewards/rejected": -4.228073596954346,
"step": 3340
},
{
"epoch": 1.183993126693543,
"grad_norm": 3.6258528177441933,
"learning_rate": 1.7890329764724522e-06,
"logits/chosen": -0.7601782083511353,
"logits/rejected": -0.2446957379579544,
"logps/chosen": -111.53218841552734,
"logps/rejected": -459.0636291503906,
"loss": 0.804,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.022862911224365,
"rewards/margins": 10.248769760131836,
"rewards/rejected": -4.225907325744629,
"step": 3360
},
{
"epoch": 1.1910426717775868,
"grad_norm": 3.737614878414752,
"learning_rate": 1.7625454091383348e-06,
"logits/chosen": -0.7394657135009766,
"logits/rejected": -0.24199071526527405,
"logps/chosen": -109.6101303100586,
"logps/rejected": -458.5634765625,
"loss": 0.7923,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.999507904052734,
"rewards/margins": 10.220422744750977,
"rewards/rejected": -4.220914363861084,
"step": 3380
},
{
"epoch": 1.1980922168616306,
"grad_norm": 3.6014758695213773,
"learning_rate": 1.7361482079608916e-06,
"logits/chosen": -0.7469202876091003,
"logits/rejected": -0.24424724280834198,
"logps/chosen": -111.15461730957031,
"logps/rejected": -459.6180725097656,
"loss": 0.8001,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.06093692779541,
"rewards/margins": 10.292411804199219,
"rewards/rejected": -4.231475353240967,
"step": 3400
},
{
"epoch": 1.2051417619456744,
"grad_norm": 3.7251487078814076,
"learning_rate": 1.7098446075984271e-06,
"logits/chosen": -0.7270140051841736,
"logits/rejected": -0.2223903387784958,
"logps/chosen": -110.1701431274414,
"logps/rejected": -460.193115234375,
"loss": 0.7937,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.080112934112549,
"rewards/margins": 10.3173246383667,
"rewards/rejected": -4.237212657928467,
"step": 3420
},
{
"epoch": 1.2121913070297183,
"grad_norm": 3.6824017522920425,
"learning_rate": 1.6836378312395985e-06,
"logits/chosen": -0.7231994271278381,
"logits/rejected": -0.2200583517551422,
"logps/chosen": -110.08992004394531,
"logps/rejected": -460.22802734375,
"loss": 0.7961,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.026852130889893,
"rewards/margins": 10.264342308044434,
"rewards/rejected": -4.237489223480225,
"step": 3440
},
{
"epoch": 1.219240852113762,
"grad_norm": 3.6649464025544725,
"learning_rate": 1.6575310902084486e-06,
"logits/chosen": -0.7727184295654297,
"logits/rejected": -0.2428928166627884,
"logps/chosen": -116.430419921875,
"logps/rejected": -460.3304748535156,
"loss": 0.8234,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.162951946258545,
"rewards/margins": 10.40161418914795,
"rewards/rejected": -4.238661766052246,
"step": 3460
},
{
"epoch": 1.2262903971978059,
"grad_norm": 3.516534580479379,
"learning_rate": 1.6315275835708968e-06,
"logits/chosen": -0.7715465426445007,
"logits/rejected": -0.2562185227870941,
"logps/chosen": -114.5073013305664,
"logps/rejected": -460.95611572265625,
"loss": 0.8189,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.068451881408691,
"rewards/margins": 10.313359260559082,
"rewards/rejected": -4.244908332824707,
"step": 3480
},
{
"epoch": 1.2333399422818496,
"grad_norm": 3.764498441728445,
"learning_rate": 1.6056304977427396e-06,
"logits/chosen": -0.7822647094726562,
"logits/rejected": -0.2754697799682617,
"logps/chosen": -108.8161849975586,
"logps/rejected": -460.80615234375,
"loss": 0.7835,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.099386692047119,
"rewards/margins": 10.342795372009277,
"rewards/rejected": -4.243407726287842,
"step": 3500
},
{
"epoch": 1.2403894873658934,
"grad_norm": 3.843560855128951,
"learning_rate": 1.579843006099182e-06,
"logits/chosen": -0.7731927633285522,
"logits/rejected": -0.2538703978061676,
"logps/chosen": -107.45731353759766,
"logps/rejected": -459.0361328125,
"loss": 0.7823,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.980363368988037,
"rewards/margins": 10.205938339233398,
"rewards/rejected": -4.225574493408203,
"step": 3520
},
{
"epoch": 1.2474390324499371,
"grad_norm": 3.852598571454623,
"learning_rate": 1.5541682685859877e-06,
"logits/chosen": -0.7549764513969421,
"logits/rejected": -0.23148897290229797,
"logps/chosen": -110.1994400024414,
"logps/rejected": -459.1859436035156,
"loss": 0.7968,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.038589000701904,
"rewards/margins": 10.265731811523438,
"rewards/rejected": -4.227142810821533,
"step": 3540
},
{
"epoch": 1.254488577533981,
"grad_norm": 3.7009734358554778,
"learning_rate": 1.5286094313322642e-06,
"logits/chosen": -0.784349799156189,
"logits/rejected": -0.25168928503990173,
"logps/chosen": -116.32988739013672,
"logps/rejected": -459.89044189453125,
"loss": 0.8291,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.14546537399292,
"rewards/margins": 10.379704475402832,
"rewards/rejected": -4.234239101409912,
"step": 3560
},
{
"epoch": 1.261538122618025,
"grad_norm": 3.6702836620131487,
"learning_rate": 1.5031696262649388e-06,
"logits/chosen": -0.7631133198738098,
"logits/rejected": -0.23987340927124023,
"logps/chosen": -108.21134185791016,
"logps/rejected": -460.1493225097656,
"loss": 0.7768,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.06735897064209,
"rewards/margins": 10.304133415222168,
"rewards/rejected": -4.236774444580078,
"step": 3580
},
{
"epoch": 1.2685876677020687,
"grad_norm": 3.7496096175509117,
"learning_rate": 1.4778519707249824e-06,
"logits/chosen": -0.7178513407707214,
"logits/rejected": -0.21321916580200195,
"logps/chosen": -108.5086669921875,
"logps/rejected": -460.8570251464844,
"loss": 0.7864,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015305519104004,
"rewards/margins": 10.259174346923828,
"rewards/rejected": -4.243868827819824,
"step": 3600
},
{
"epoch": 1.2756372127861124,
"grad_norm": 3.5821190789210116,
"learning_rate": 1.452659567085416e-06,
"logits/chosen": -0.7405712008476257,
"logits/rejected": -0.22374410927295685,
"logps/chosen": -111.2579116821289,
"logps/rejected": -461.4320373535156,
"loss": 0.7931,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.068841934204102,
"rewards/margins": 10.318615913391113,
"rewards/rejected": -4.24977445602417,
"step": 3620
},
{
"epoch": 1.2826867578701562,
"grad_norm": 3.4834658925918993,
"learning_rate": 1.427595502371154e-06,
"logits/chosen": -0.7308667302131653,
"logits/rejected": -0.21432648599147797,
"logps/chosen": -112.19306945800781,
"logps/rejected": -461.59405517578125,
"loss": 0.8011,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.054664134979248,
"rewards/margins": 10.305948257446289,
"rewards/rejected": -4.251284122467041,
"step": 3640
},
{
"epoch": 1.2897363029542,
"grad_norm": 3.6201120344067763,
"learning_rate": 1.4026628478807245e-06,
"logits/chosen": -0.7650494575500488,
"logits/rejected": -0.2274860143661499,
"logps/chosen": -108.77252197265625,
"logps/rejected": -461.6319274902344,
"loss": 0.7948,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.976110458374023,
"rewards/margins": 10.2275972366333,
"rewards/rejected": -4.251486778259277,
"step": 3660
},
{
"epoch": 1.2967858480382437,
"grad_norm": 3.7346652227529877,
"learning_rate": 1.3778646588099156e-06,
"logits/chosen": -0.7402178049087524,
"logits/rejected": -0.2060566246509552,
"logps/chosen": -112.3010025024414,
"logps/rejected": -461.74835205078125,
"loss": 0.7965,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0907111167907715,
"rewards/margins": 10.343549728393555,
"rewards/rejected": -4.252838134765625,
"step": 3680
},
{
"epoch": 1.3038353931222875,
"grad_norm": 3.768580088883866,
"learning_rate": 1.353203973877406e-06,
"logits/chosen": -0.7112205624580383,
"logits/rejected": -0.1953021138906479,
"logps/chosen": -110.20354461669922,
"logps/rejected": -461.6409606933594,
"loss": 0.7956,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.044336795806885,
"rewards/margins": 10.29607105255127,
"rewards/rejected": -4.251734256744385,
"step": 3700
},
{
"epoch": 1.3108849382063315,
"grad_norm": 3.4440781350790908,
"learning_rate": 1.328683814952401e-06,
"logits/chosen": -0.7040776014328003,
"logits/rejected": -0.18124286830425262,
"logps/chosen": -111.324462890625,
"logps/rejected": -461.9501037597656,
"loss": 0.7997,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.025957107543945,
"rewards/margins": 10.280824661254883,
"rewards/rejected": -4.254867076873779,
"step": 3720
},
{
"epoch": 1.3179344832903752,
"grad_norm": 3.657636143497304,
"learning_rate": 1.3043071866843393e-06,
"logits/chosen": -0.740128755569458,
"logits/rejected": -0.20634058117866516,
"logps/chosen": -116.50548553466797,
"logps/rejected": -462.2780456542969,
"loss": 0.8239,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.147490978240967,
"rewards/margins": 10.405622482299805,
"rewards/rejected": -4.258132457733154,
"step": 3740
},
{
"epoch": 1.324984028374419,
"grad_norm": 3.7645166487401998,
"learning_rate": 1.280077076134713e-06,
"logits/chosen": -0.7460483908653259,
"logits/rejected": -0.22827637195587158,
"logps/chosen": -111.3073959350586,
"logps/rejected": -462.13555908203125,
"loss": 0.8051,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.000453472137451,
"rewards/margins": 10.25708293914795,
"rewards/rejected": -4.256629467010498,
"step": 3760
},
{
"epoch": 1.3320335734584627,
"grad_norm": 3.6937410357737135,
"learning_rate": 1.2559964524110329e-06,
"logits/chosen": -0.7219620943069458,
"logits/rejected": -0.20633235573768616,
"logps/chosen": -111.94634246826172,
"logps/rejected": -462.30010986328125,
"loss": 0.8003,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.105963230133057,
"rewards/margins": 10.364226341247559,
"rewards/rejected": -4.258262634277344,
"step": 3780
},
{
"epoch": 1.3390831185425065,
"grad_norm": 3.5068780936908346,
"learning_rate": 1.2320682663030067e-06,
"logits/chosen": -0.7292844653129578,
"logits/rejected": -0.2123355120420456,
"logps/chosen": -110.45084381103516,
"logps/rejected": -462.43505859375,
"loss": 0.788,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.103517532348633,
"rewards/margins": 10.363227844238281,
"rewards/rejected": -4.259710788726807,
"step": 3800
},
{
"epoch": 1.3461326636265505,
"grad_norm": 4.0334399408644614,
"learning_rate": 1.208295449920953e-06,
"logits/chosen": -0.7406136393547058,
"logits/rejected": -0.21345853805541992,
"logps/chosen": -111.7240982055664,
"logps/rejected": -462.9325866699219,
"loss": 0.7966,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.093398571014404,
"rewards/margins": 10.35802173614502,
"rewards/rejected": -4.264623165130615,
"step": 3820
},
{
"epoch": 1.353182208710594,
"grad_norm": 3.8844200352095095,
"learning_rate": 1.1846809163365053e-06,
"logits/chosen": -0.7404711246490479,
"logits/rejected": -0.21118001639842987,
"logps/chosen": -114.276123046875,
"logps/rejected": -463.1968688964844,
"loss": 0.8175,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.063361644744873,
"rewards/margins": 10.330649375915527,
"rewards/rejected": -4.267287731170654,
"step": 3840
},
{
"epoch": 1.360231753794638,
"grad_norm": 3.6024931838167005,
"learning_rate": 1.1612275592256505e-06,
"logits/chosen": -0.7284431457519531,
"logits/rejected": -0.2048410028219223,
"logps/chosen": -109.4595947265625,
"logps/rejected": -463.3080139160156,
"loss": 0.7931,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.021738052368164,
"rewards/margins": 10.290040016174316,
"rewards/rejected": -4.2683024406433105,
"step": 3860
},
{
"epoch": 1.3672812988786818,
"grad_norm": 3.616015891805895,
"learning_rate": 1.137938252514146e-06,
"logits/chosen": -0.7418842315673828,
"logits/rejected": -0.21675625443458557,
"logps/chosen": -112.6479721069336,
"logps/rejected": -463.680908203125,
"loss": 0.7983,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.131333351135254,
"rewards/margins": 10.403575897216797,
"rewards/rejected": -4.272243022918701,
"step": 3880
},
{
"epoch": 1.3743308439627255,
"grad_norm": 3.5890363099718887,
"learning_rate": 1.1148158500253528e-06,
"logits/chosen": -0.7448534965515137,
"logits/rejected": -0.20057539641857147,
"logps/chosen": -111.8428955078125,
"logps/rejected": -464.0297546386719,
"loss": 0.8019,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.048543453216553,
"rewards/margins": 10.324110984802246,
"rewards/rejected": -4.275567531585693,
"step": 3900
},
{
"epoch": 1.3813803890467693,
"grad_norm": 3.840508286124758,
"learning_rate": 1.0918631851305357e-06,
"logits/chosen": -0.7599017024040222,
"logits/rejected": -0.20486782491207123,
"logps/chosen": -110.95333099365234,
"logps/rejected": -464.31695556640625,
"loss": 0.7993,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.010303020477295,
"rewards/margins": 10.288734436035156,
"rewards/rejected": -4.2784318923950195,
"step": 3920
},
{
"epoch": 1.388429934130813,
"grad_norm": 3.7059192007951807,
"learning_rate": 1.0690830704016624e-06,
"logits/chosen": -0.7338669300079346,
"logits/rejected": -0.19808827340602875,
"logps/chosen": -110.7686538696289,
"logps/rejected": -464.29425048828125,
"loss": 0.7951,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.024989128112793,
"rewards/margins": 10.303257942199707,
"rewards/rejected": -4.278269290924072,
"step": 3940
},
{
"epoch": 1.395479479214857,
"grad_norm": 3.7606971138852274,
"learning_rate": 1.0464782972667682e-06,
"logits/chosen": -0.7379679679870605,
"logits/rejected": -0.1948491781949997,
"logps/chosen": -108.2778091430664,
"logps/rejected": -464.478515625,
"loss": 0.7774,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.026111602783203,
"rewards/margins": 10.306151390075684,
"rewards/rejected": -4.280040740966797,
"step": 3960
},
{
"epoch": 1.4025290242989006,
"grad_norm": 3.816348534495336,
"learning_rate": 1.0240516356678853e-06,
"logits/chosen": -0.7188676595687866,
"logits/rejected": -0.20450268685817719,
"logps/chosen": -106.01759338378906,
"logps/rejected": -464.5401306152344,
"loss": 0.7649,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.007740020751953,
"rewards/margins": 10.288371086120605,
"rewards/rejected": -4.2806315422058105,
"step": 3980
},
{
"epoch": 1.4095785693829446,
"grad_norm": 3.612107514399364,
"learning_rate": 1.0018058337216327e-06,
"logits/chosen": -0.7266545295715332,
"logits/rejected": -0.19752562046051025,
"logps/chosen": -110.18603515625,
"logps/rejected": -464.22833251953125,
"loss": 0.7897,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.06878137588501,
"rewards/margins": 10.34636116027832,
"rewards/rejected": -4.277579307556152,
"step": 4000
},
{
"epoch": 1.4166281144669883,
"grad_norm": 3.7833872085921882,
"learning_rate": 9.797436173824606e-07,
"logits/chosen": -0.7136607766151428,
"logits/rejected": -0.1925588697195053,
"logps/chosen": -112.893310546875,
"logps/rejected": -463.73590087890625,
"loss": 0.8014,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.104711055755615,
"rewards/margins": 10.377421379089355,
"rewards/rejected": -4.272710800170898,
"step": 4020
},
{
"epoch": 1.423677659551032,
"grad_norm": 3.7145911218808876,
"learning_rate": 9.578676901086213e-07,
"logits/chosen": -0.7393223643302917,
"logits/rejected": -0.20907925069332123,
"logps/chosen": -107.60369110107422,
"logps/rejected": -463.6431579589844,
"loss": 0.7849,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.018612384796143,
"rewards/margins": 10.290209770202637,
"rewards/rejected": -4.271597385406494,
"step": 4040
},
{
"epoch": 1.4307272046350759,
"grad_norm": 3.753197231623771,
"learning_rate": 9.361807325308861e-07,
"logits/chosen": -0.7394221425056458,
"logits/rejected": -0.2052200883626938,
"logps/chosen": -110.26177215576172,
"logps/rejected": -463.85186767578125,
"loss": 0.7936,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.044154644012451,
"rewards/margins": 10.317965507507324,
"rewards/rejected": -4.273810863494873,
"step": 4060
},
{
"epoch": 1.4377767497191196,
"grad_norm": 3.62410702725855,
"learning_rate": 9.146854021240795e-07,
"logits/chosen": -0.7467309832572937,
"logits/rejected": -0.20924925804138184,
"logps/chosen": -109.57176971435547,
"logps/rejected": -464.32281494140625,
"loss": 0.7825,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.084084987640381,
"rewards/margins": 10.362741470336914,
"rewards/rejected": -4.278656482696533,
"step": 4080
},
{
"epoch": 1.4448262948031636,
"grad_norm": 3.592620796932219,
"learning_rate": 8.933843328814224e-07,
"logits/chosen": -0.7418180704116821,
"logits/rejected": -0.2005995362997055,
"logps/chosen": -106.8327865600586,
"logps/rejected": -464.5583190917969,
"loss": 0.7709,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.014178276062012,
"rewards/margins": 10.294994354248047,
"rewards/rejected": -4.280816555023193,
"step": 4100
},
{
"epoch": 1.4518758398872071,
"grad_norm": 3.5324680986915196,
"learning_rate": 8.722801349917806e-07,
"logits/chosen": -0.7462785840034485,
"logits/rejected": -0.19319048523902893,
"logps/chosen": -104.92723846435547,
"logps/rejected": -464.9132995605469,
"loss": 0.7672,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.995131015777588,
"rewards/margins": 10.279424667358398,
"rewards/rejected": -4.2842936515808105,
"step": 4120
},
{
"epoch": 1.4589253849712511,
"grad_norm": 3.6252755412533073,
"learning_rate": 8.513753945198072e-07,
"logits/chosen": -0.7537373900413513,
"logits/rejected": -0.19358740746974945,
"logps/chosen": -110.26961517333984,
"logps/rejected": -465.21759033203125,
"loss": 0.789,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.088963031768799,
"rewards/margins": 10.37643814086914,
"rewards/rejected": -4.287475109100342,
"step": 4140
},
{
"epoch": 1.465974930055295,
"grad_norm": 3.872665997590583,
"learning_rate": 8.306726730890638e-07,
"logits/chosen": -0.7590047717094421,
"logits/rejected": -0.19472454488277435,
"logps/chosen": -110.58663177490234,
"logps/rejected": -465.0445861816406,
"loss": 0.7871,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.122416973114014,
"rewards/margins": 10.408243179321289,
"rewards/rejected": -4.285826206207275,
"step": 4160
},
{
"epoch": 1.4730244751393387,
"grad_norm": 3.7102846367736086,
"learning_rate": 8.101745075681106e-07,
"logits/chosen": -0.7329391837120056,
"logits/rejected": -0.1909240186214447,
"logps/chosen": -110.4598159790039,
"logps/rejected": -465.27642822265625,
"loss": 0.7896,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.111295223236084,
"rewards/margins": 10.399382591247559,
"rewards/rejected": -4.288087368011475,
"step": 4180
},
{
"epoch": 1.4800740202233824,
"grad_norm": 3.906032833336107,
"learning_rate": 7.898834097596553e-07,
"logits/chosen": -0.7385097742080688,
"logits/rejected": -0.19473496079444885,
"logps/chosen": -107.56981658935547,
"logps/rejected": -465.4288024902344,
"loss": 0.7791,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.026830196380615,
"rewards/margins": 10.316374778747559,
"rewards/rejected": -4.289544105529785,
"step": 4200
},
{
"epoch": 1.4871235653074262,
"grad_norm": 3.783514582812726,
"learning_rate": 7.698018660927562e-07,
"logits/chosen": -0.7368226647377014,
"logits/rejected": -0.19908547401428223,
"logps/chosen": -113.80726623535156,
"logps/rejected": -465.6442565917969,
"loss": 0.8112,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0709099769592285,
"rewards/margins": 10.362727165222168,
"rewards/rejected": -4.291816711425781,
"step": 4220
},
{
"epoch": 1.4941731103914702,
"grad_norm": 3.463238510062497,
"learning_rate": 7.499323373181394e-07,
"logits/chosen": -0.7456727027893066,
"logits/rejected": -0.2014772891998291,
"logps/chosen": -110.4579849243164,
"logps/rejected": -465.69384765625,
"loss": 0.7934,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0509185791015625,
"rewards/margins": 10.34317398071289,
"rewards/rejected": -4.29225492477417,
"step": 4240
},
{
"epoch": 1.5012226554755137,
"grad_norm": 3.64100892141048,
"learning_rate": 7.302772582066686e-07,
"logits/chosen": -0.7430599331855774,
"logits/rejected": -0.20189471542835236,
"logps/chosen": -107.80598449707031,
"logps/rejected": -465.7997741699219,
"loss": 0.7769,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.060474395751953,
"rewards/margins": 10.353759765625,
"rewards/rejected": -4.293285369873047,
"step": 4260
},
{
"epoch": 1.5082722005595577,
"grad_norm": 3.821344654148155,
"learning_rate": 7.108390372509894e-07,
"logits/chosen": -0.7576232552528381,
"logits/rejected": -0.20257721841335297,
"logps/chosen": -107.84185028076172,
"logps/rejected": -466.0832214355469,
"loss": 0.7835,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.016088008880615,
"rewards/margins": 10.312129020690918,
"rewards/rejected": -4.296041011810303,
"step": 4280
},
{
"epoch": 1.5153217456436014,
"grad_norm": 3.6781483222864537,
"learning_rate": 6.916200563703987e-07,
"logits/chosen": -0.7738979458808899,
"logits/rejected": -0.2092943638563156,
"logps/chosen": -110.91960906982422,
"logps/rejected": -466.02197265625,
"loss": 0.7967,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.113348007202148,
"rewards/margins": 10.408868789672852,
"rewards/rejected": -4.295520782470703,
"step": 4300
},
{
"epoch": 1.5223712907276452,
"grad_norm": 3.8463727860471097,
"learning_rate": 6.726226706189668e-07,
"logits/chosen": -0.7589637637138367,
"logits/rejected": -0.20386461913585663,
"logps/chosen": -108.56011962890625,
"logps/rejected": -466.29669189453125,
"loss": 0.7797,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9991984367370605,
"rewards/margins": 10.297418594360352,
"rewards/rejected": -4.298220157623291,
"step": 4320
},
{
"epoch": 1.529420835811689,
"grad_norm": 3.7546467954048284,
"learning_rate": 6.538492078969611e-07,
"logits/chosen": -0.7622847557067871,
"logits/rejected": -0.2044745236635208,
"logps/chosen": -109.2506103515625,
"logps/rejected": -466.3732604980469,
"loss": 0.7839,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.045871257781982,
"rewards/margins": 10.344886779785156,
"rewards/rejected": -4.299015522003174,
"step": 4340
},
{
"epoch": 1.5364703808957327,
"grad_norm": 3.75404337894338,
"learning_rate": 6.353019686655806e-07,
"logits/chosen": -0.7753379940986633,
"logits/rejected": -0.20546992123126984,
"logps/chosen": -110.1863021850586,
"logps/rejected": -466.5959167480469,
"loss": 0.7903,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.081169128417969,
"rewards/margins": 10.382391929626465,
"rewards/rejected": -4.301222801208496,
"step": 4360
},
{
"epoch": 1.5435199259797767,
"grad_norm": 3.5216418635854274,
"learning_rate": 6.169832256650698e-07,
"logits/chosen": -0.7565549612045288,
"logits/rejected": -0.20086073875427246,
"logps/chosen": -104.84516906738281,
"logps/rejected": -466.4576721191406,
"loss": 0.7601,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.991360664367676,
"rewards/margins": 10.291136741638184,
"rewards/rejected": -4.299776554107666,
"step": 4380
},
{
"epoch": 1.5505694710638203,
"grad_norm": 3.5037838198076003,
"learning_rate": 5.988952236362153e-07,
"logits/chosen": -0.7642261385917664,
"logits/rejected": -0.2072605937719345,
"logps/chosen": -107.33918762207031,
"logps/rejected": -466.7052917480469,
"loss": 0.7674,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.061375617980957,
"rewards/margins": 10.363710403442383,
"rewards/rejected": -4.302334785461426,
"step": 4400
},
{
"epoch": 1.5576190161478642,
"grad_norm": 3.864008710250573,
"learning_rate": 5.810401790452888e-07,
"logits/chosen": -0.7727814316749573,
"logits/rejected": -0.21775054931640625,
"logps/chosen": -107.37860107421875,
"logps/rejected": -466.77545166015625,
"loss": 0.7793,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.998612403869629,
"rewards/margins": 10.301616668701172,
"rewards/rejected": -4.303005218505859,
"step": 4420
},
{
"epoch": 1.564668561231908,
"grad_norm": 3.809694724002222,
"learning_rate": 5.63420279812435e-07,
"logits/chosen": -0.7627360224723816,
"logits/rejected": -0.22790618240833282,
"logps/chosen": -106.657470703125,
"logps/rejected": -466.9906921386719,
"loss": 0.7697,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.044661998748779,
"rewards/margins": 10.349881172180176,
"rewards/rejected": -4.305219650268555,
"step": 4440
},
{
"epoch": 1.5717181063159518,
"grad_norm": 3.810418330921462,
"learning_rate": 5.460376850435775e-07,
"logits/chosen": -0.771653950214386,
"logits/rejected": -0.2361011505126953,
"logps/chosen": -112.78193664550781,
"logps/rejected": -467.257568359375,
"loss": 0.8037,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.097472667694092,
"rewards/margins": 10.40538501739502,
"rewards/rejected": -4.307912349700928,
"step": 4460
},
{
"epoch": 1.5787676513999958,
"grad_norm": 3.622523471695452,
"learning_rate": 5.288945247658411e-07,
"logits/chosen": -0.7521407008171082,
"logits/rejected": -0.22493302822113037,
"logps/chosen": -112.21357727050781,
"logps/rejected": -467.2759704589844,
"loss": 0.803,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.095398902893066,
"rewards/margins": 10.403505325317383,
"rewards/rejected": -4.308106899261475,
"step": 4480
},
{
"epoch": 1.5858171964840393,
"grad_norm": 3.891672529591952,
"learning_rate": 5.11992899666546e-07,
"logits/chosen": -0.7626200914382935,
"logits/rejected": -0.2184225618839264,
"logps/chosen": -107.69268035888672,
"logps/rejected": -466.9710998535156,
"loss": 0.7866,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.997985363006592,
"rewards/margins": 10.302816390991211,
"rewards/rejected": -4.3048319816589355,
"step": 4500
},
{
"epoch": 1.5928667415680833,
"grad_norm": 3.6231664217360335,
"learning_rate": 4.953348808357897e-07,
"logits/chosen": -0.7735008597373962,
"logits/rejected": -0.22704847157001495,
"logps/chosen": -108.8917007446289,
"logps/rejected": -466.9140319824219,
"loss": 0.7833,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.050678253173828,
"rewards/margins": 10.355135917663574,
"rewards/rejected": -4.304458141326904,
"step": 4520
},
{
"epoch": 1.5999162866521268,
"grad_norm": 3.503524696607652,
"learning_rate": 4.789225095126665e-07,
"logits/chosen": -0.7676795125007629,
"logits/rejected": -0.2262260913848877,
"logps/chosen": -110.2127456665039,
"logps/rejected": -467.002685546875,
"loss": 0.7868,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.059360027313232,
"rewards/margins": 10.364716529846191,
"rewards/rejected": -4.305356502532959,
"step": 4540
},
{
"epoch": 1.6069658317361708,
"grad_norm": 3.5468237051894795,
"learning_rate": 4.6275779683513044e-07,
"logits/chosen": -0.7663300633430481,
"logits/rejected": -0.22453832626342773,
"logps/chosen": -110.7569580078125,
"logps/rejected": -467.04058837890625,
"loss": 0.7952,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.11013650894165,
"rewards/margins": 10.415862083435059,
"rewards/rejected": -4.305725574493408,
"step": 4560
},
{
"epoch": 1.6140153768202146,
"grad_norm": 3.9447717460332745,
"learning_rate": 4.46842723593561e-07,
"logits/chosen": -0.7553516030311584,
"logits/rejected": -0.22027336061000824,
"logps/chosen": -108.72711181640625,
"logps/rejected": -467.15625,
"loss": 0.786,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.020168304443359,
"rewards/margins": 10.327046394348145,
"rewards/rejected": -4.306877613067627,
"step": 4580
},
{
"epoch": 1.6210649219042583,
"grad_norm": 3.642004095627008,
"learning_rate": 4.311792399880382e-07,
"logits/chosen": -0.7744750380516052,
"logits/rejected": -0.22373166680335999,
"logps/chosen": -105.97419738769531,
"logps/rejected": -467.371337890625,
"loss": 0.7647,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.069406509399414,
"rewards/margins": 10.37836742401123,
"rewards/rejected": -4.308960914611816,
"step": 4600
},
{
"epoch": 1.6281144669883023,
"grad_norm": 3.637753232067399,
"learning_rate": 4.1576926538936993e-07,
"logits/chosen": -0.7700881958007812,
"logits/rejected": -0.22813375294208527,
"logps/chosen": -107.51484680175781,
"logps/rejected": -467.3663635253906,
"loss": 0.7771,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9894490242004395,
"rewards/margins": 10.298357963562012,
"rewards/rejected": -4.308909893035889,
"step": 4620
},
{
"epoch": 1.6351640120723459,
"grad_norm": 3.7321362303897905,
"learning_rate": 4.006146881038947e-07,
"logits/chosen": -0.7837023735046387,
"logits/rejected": -0.23206615447998047,
"logps/chosen": -109.95082092285156,
"logps/rejected": -467.3857116699219,
"loss": 0.7856,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.083016395568848,
"rewards/margins": 10.39222240447998,
"rewards/rejected": -4.309206008911133,
"step": 4640
},
{
"epoch": 1.6422135571563898,
"grad_norm": 3.761528498742703,
"learning_rate": 3.8571736514209477e-07,
"logits/chosen": -0.790883481502533,
"logits/rejected": -0.23554182052612305,
"logps/chosen": -108.13374328613281,
"logps/rejected": -467.5670471191406,
"loss": 0.7791,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0900559425354,
"rewards/margins": 10.40097713470459,
"rewards/rejected": -4.310921669006348,
"step": 4660
},
{
"epoch": 1.6492631022404336,
"grad_norm": 3.8270268639175664,
"learning_rate": 3.710791219910409e-07,
"logits/chosen": -0.7919120788574219,
"logits/rejected": -0.23967795073986053,
"logps/chosen": -108.26678466796875,
"logps/rejected": -467.712158203125,
"loss": 0.7764,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.098334789276123,
"rewards/margins": 10.41071605682373,
"rewards/rejected": -4.312382698059082,
"step": 4680
},
{
"epoch": 1.6563126473244774,
"grad_norm": 3.736170667753776,
"learning_rate": 3.567017523907018e-07,
"logits/chosen": -0.7862281203269958,
"logits/rejected": -0.2402174472808838,
"logps/chosen": -106.91987609863281,
"logps/rejected": -467.6073303222656,
"loss": 0.7763,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0048723220825195,
"rewards/margins": 10.316206932067871,
"rewards/rejected": -4.311334133148193,
"step": 4700
},
{
"epoch": 1.6633621924085211,
"grad_norm": 3.804063902145054,
"learning_rate": 3.425870181141394e-07,
"logits/chosen": -0.774181067943573,
"logits/rejected": -0.23379312455654144,
"logps/chosen": -109.593505859375,
"logps/rejected": -467.6258850097656,
"loss": 0.7874,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.03621768951416,
"rewards/margins": 10.347805976867676,
"rewards/rejected": -4.311588287353516,
"step": 4720
},
{
"epoch": 1.6704117374925649,
"grad_norm": 3.6994558144300425,
"learning_rate": 3.2873664875162997e-07,
"logits/chosen": -0.7831236720085144,
"logits/rejected": -0.22968469560146332,
"logps/chosen": -109.0361099243164,
"logps/rejected": -467.54327392578125,
"loss": 0.7804,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.145629405975342,
"rewards/margins": 10.456382751464844,
"rewards/rejected": -4.310754299163818,
"step": 4740
},
{
"epoch": 1.6774612825766089,
"grad_norm": 3.7255378247554765,
"learning_rate": 3.1515234149872123e-07,
"logits/chosen": -0.7805240154266357,
"logits/rejected": -0.2411789894104004,
"logps/chosen": -110.95159912109375,
"logps/rejected": -467.4502868652344,
"loss": 0.8005,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.015338897705078,
"rewards/margins": 10.325087547302246,
"rewards/rejected": -4.30974817276001,
"step": 4760
},
{
"epoch": 1.6845108276606524,
"grad_norm": 3.8920066879065813,
"learning_rate": 3.018357609482603e-07,
"logits/chosen": -0.784945011138916,
"logits/rejected": -0.24280527234077454,
"logps/chosen": -108.68704986572266,
"logps/rejected": -467.4269104003906,
"loss": 0.7854,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.011569499969482,
"rewards/margins": 10.32103443145752,
"rewards/rejected": -4.309464931488037,
"step": 4780
},
{
"epoch": 1.6915603727446964,
"grad_norm": 3.72619208545696,
"learning_rate": 2.887885388864206e-07,
"logits/chosen": -0.7819967269897461,
"logits/rejected": -0.24671606719493866,
"logps/chosen": -109.5833511352539,
"logps/rejected": -467.4178161621094,
"loss": 0.7849,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.126197814941406,
"rewards/margins": 10.435675621032715,
"rewards/rejected": -4.309477806091309,
"step": 4800
},
{
"epoch": 1.6986099178287402,
"grad_norm": 3.7736322479661344,
"learning_rate": 2.760122740927429e-07,
"logits/chosen": -0.7813412547111511,
"logits/rejected": -0.24587781727313995,
"logps/chosen": -111.07955169677734,
"logps/rejected": -467.5440368652344,
"loss": 0.7956,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.071666717529297,
"rewards/margins": 10.382463455200195,
"rewards/rejected": -4.31079626083374,
"step": 4820
},
{
"epoch": 1.705659462912784,
"grad_norm": 3.7894965626963213,
"learning_rate": 2.6350853214422825e-07,
"logits/chosen": -0.7834030985832214,
"logits/rejected": -0.248988538980484,
"logps/chosen": -108.4309310913086,
"logps/rejected": -467.5397644042969,
"loss": 0.7789,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.071558952331543,
"rewards/margins": 10.382250785827637,
"rewards/rejected": -4.310691833496094,
"step": 4840
},
{
"epoch": 1.7127090079968277,
"grad_norm": 3.611336149501781,
"learning_rate": 2.512788452234921e-07,
"logits/chosen": -0.7827231287956238,
"logits/rejected": -0.24999628961086273,
"logps/chosen": -110.13957977294922,
"logps/rejected": -467.6318359375,
"loss": 0.7913,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0582194328308105,
"rewards/margins": 10.369796752929688,
"rewards/rejected": -4.311577796936035,
"step": 4860
},
{
"epoch": 1.7197585530808714,
"grad_norm": 3.5996393260710535,
"learning_rate": 2.3932471193101546e-07,
"logits/chosen": -0.7880483865737915,
"logits/rejected": -0.2517296075820923,
"logps/chosen": -110.41153717041016,
"logps/rejected": -467.8431091308594,
"loss": 0.7877,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.099769115447998,
"rewards/margins": 10.413488388061523,
"rewards/rejected": -4.313718318939209,
"step": 4880
},
{
"epoch": 1.7268080981649154,
"grad_norm": 3.705968217028357,
"learning_rate": 2.2764759710150768e-07,
"logits/chosen": -0.7876673936843872,
"logits/rejected": -0.24836230278015137,
"logps/chosen": -107.29426574707031,
"logps/rejected": -467.8483581542969,
"loss": 0.7781,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.002329349517822,
"rewards/margins": 10.31602954864502,
"rewards/rejected": -4.313700199127197,
"step": 4900
},
{
"epoch": 1.733857643248959,
"grad_norm": 3.9354283285524443,
"learning_rate": 2.1624893162441179e-07,
"logits/chosen": -0.7937143445014954,
"logits/rejected": -0.24714671075344086,
"logps/chosen": -111.57144927978516,
"logps/rejected": -467.8736267089844,
"loss": 0.7945,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.093102931976318,
"rewards/margins": 10.407225608825684,
"rewards/rejected": -4.314122200012207,
"step": 4920
},
{
"epoch": 1.740907188333003,
"grad_norm": 3.6111104668103904,
"learning_rate": 2.0513011226856338e-07,
"logits/chosen": -0.7964981198310852,
"logits/rejected": -0.24506263434886932,
"logps/chosen": -107.6418228149414,
"logps/rejected": -467.8564147949219,
"loss": 0.7846,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.011826992034912,
"rewards/margins": 10.3255615234375,
"rewards/rejected": -4.313735485076904,
"step": 4940
},
{
"epoch": 1.7479567334170467,
"grad_norm": 3.9897264942811193,
"learning_rate": 1.94292501511035e-07,
"logits/chosen": -0.7860663533210754,
"logits/rejected": -0.24424245953559875,
"logps/chosen": -109.5399398803711,
"logps/rejected": -467.9327697753906,
"loss": 0.7766,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.140701770782471,
"rewards/margins": 10.455405235290527,
"rewards/rejected": -4.31470251083374,
"step": 4960
},
{
"epoch": 1.7550062785010905,
"grad_norm": 3.6959213997496243,
"learning_rate": 1.8373742737017975e-07,
"logits/chosen": -0.7847122550010681,
"logits/rejected": -0.24023690819740295,
"logps/chosen": -105.888671875,
"logps/rejected": -467.9982604980469,
"loss": 0.7668,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.049968719482422,
"rewards/margins": 10.365229606628418,
"rewards/rejected": -4.315260410308838,
"step": 4980
},
{
"epoch": 1.7620558235851342,
"grad_norm": 3.7782181080865986,
"learning_rate": 1.7346618324290105e-07,
"logits/chosen": -0.7863363027572632,
"logits/rejected": -0.24366001784801483,
"logps/chosen": -108.9438247680664,
"logps/rejected": -468.1618347167969,
"loss": 0.778,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.094727039337158,
"rewards/margins": 10.411661148071289,
"rewards/rejected": -4.316934585571289,
"step": 5000
},
{
"epoch": 1.769105368669178,
"grad_norm": 3.483830010875064,
"learning_rate": 1.634800277461593e-07,
"logits/chosen": -0.7824100255966187,
"logits/rejected": -0.24446825683116913,
"logps/chosen": -110.3822021484375,
"logps/rejected": -468.2240295410156,
"loss": 0.7862,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0525736808776855,
"rewards/margins": 10.370158195495605,
"rewards/rejected": -4.31758451461792,
"step": 5020
},
{
"epoch": 1.776154913753222,
"grad_norm": 3.634525654068984,
"learning_rate": 1.5378018456274568e-07,
"logits/chosen": -0.7856306433677673,
"logits/rejected": -0.24479059875011444,
"logps/chosen": -109.49385070800781,
"logps/rejected": -468.20947265625,
"loss": 0.7786,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.074910640716553,
"rewards/margins": 10.392340660095215,
"rewards/rejected": -4.317430019378662,
"step": 5040
},
{
"epoch": 1.7832044588372655,
"grad_norm": 3.5533845133528943,
"learning_rate": 1.4436784229133444e-07,
"logits/chosen": -0.7944580912590027,
"logits/rejected": -0.24189452826976776,
"logps/chosen": -106.3443832397461,
"logps/rejected": -468.205078125,
"loss": 0.7666,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.092994213104248,
"rewards/margins": 10.410300254821777,
"rewards/rejected": -4.317306041717529,
"step": 5060
},
{
"epoch": 1.7902540039213095,
"grad_norm": 3.761310621400386,
"learning_rate": 1.35244154300834e-07,
"logits/chosen": -0.7947224974632263,
"logits/rejected": -0.24308596551418304,
"logps/chosen": -106.94145965576172,
"logps/rejected": -468.1993103027344,
"loss": 0.7762,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0086212158203125,
"rewards/margins": 10.32580280303955,
"rewards/rejected": -4.3171820640563965,
"step": 5080
},
{
"epoch": 1.7973035490053533,
"grad_norm": 3.8649700277917898,
"learning_rate": 1.2641023858905476e-07,
"logits/chosen": -0.7907156348228455,
"logits/rejected": -0.24166785180568695,
"logps/chosen": -110.41487121582031,
"logps/rejected": -468.1279296875,
"loss": 0.7962,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.092523097991943,
"rewards/margins": 10.40918254852295,
"rewards/rejected": -4.316659450531006,
"step": 5100
},
{
"epoch": 1.804353094089397,
"grad_norm": 3.8300285347879295,
"learning_rate": 1.1786717764571464e-07,
"logits/chosen": -0.7850465178489685,
"logits/rejected": -0.23768572509288788,
"logps/chosen": -107.66456604003906,
"logps/rejected": -468.2193908691406,
"loss": 0.7835,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.02831506729126,
"rewards/margins": 10.345748901367188,
"rewards/rejected": -4.317434310913086,
"step": 5120
},
{
"epoch": 1.8114026391734408,
"grad_norm": 3.729330289057156,
"learning_rate": 1.0961601831978946e-07,
"logits/chosen": -0.7787120938301086,
"logits/rejected": -0.242794468998909,
"logps/chosen": -110.7241439819336,
"logps/rejected": -468.2672424316406,
"loss": 0.7869,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.073513031005859,
"rewards/margins": 10.3915433883667,
"rewards/rejected": -4.318031311035156,
"step": 5140
},
{
"epoch": 1.8184521842574846,
"grad_norm": 3.765132850408836,
"learning_rate": 1.0165777169123703e-07,
"logits/chosen": -0.7908723950386047,
"logits/rejected": -0.2450651377439499,
"logps/chosen": -107.59815979003906,
"logps/rejected": -468.2784118652344,
"loss": 0.7814,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.020245552062988,
"rewards/margins": 10.338305473327637,
"rewards/rejected": -4.318060398101807,
"step": 5160
},
{
"epoch": 1.8255017293415285,
"grad_norm": 3.8602406164077854,
"learning_rate": 9.399341294709957e-08,
"logits/chosen": -0.7792836427688599,
"logits/rejected": -0.24651508033275604,
"logps/chosen": -108.40166473388672,
"logps/rejected": -468.3190612792969,
"loss": 0.7774,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.076695919036865,
"rewards/margins": 10.395309448242188,
"rewards/rejected": -4.318613529205322,
"step": 5180
},
{
"epoch": 1.832551274425572,
"grad_norm": 3.700379682071922,
"learning_rate": 8.662388126200877e-08,
"logits/chosen": -0.7963529825210571,
"logits/rejected": -0.2502332627773285,
"logps/chosen": -110.1925277709961,
"logps/rejected": -468.2157287597656,
"loss": 0.782,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.111256122589111,
"rewards/margins": 10.428763389587402,
"rewards/rejected": -4.317507266998291,
"step": 5200
},
{
"epoch": 1.839600819509616,
"grad_norm": 3.933863492006353,
"learning_rate": 7.955007968309835e-08,
"logits/chosen": -0.7933685779571533,
"logits/rejected": -0.24959242343902588,
"logps/chosen": -109.4911117553711,
"logps/rejected": -468.35675048828125,
"loss": 0.7803,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.192666053771973,
"rewards/margins": 10.511618614196777,
"rewards/rejected": -4.318951606750488,
"step": 5220
},
{
"epoch": 1.8466503645936598,
"grad_norm": 3.7362448675611453,
"learning_rate": 7.277287501934794e-08,
"logits/chosen": -0.7918945550918579,
"logits/rejected": -0.24759231507778168,
"logps/chosen": -107.86492156982422,
"logps/rejected": -468.30352783203125,
"loss": 0.7757,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.056914806365967,
"rewards/margins": 10.37521743774414,
"rewards/rejected": -4.31830358505249,
"step": 5240
},
{
"epoch": 1.8536999096777036,
"grad_norm": 3.888128836007745,
"learning_rate": 6.629309773536735e-08,
"logits/chosen": -0.7919878363609314,
"logits/rejected": -0.24335601925849915,
"logps/chosen": -111.12847137451172,
"logps/rejected": -468.2657165527344,
"loss": 0.7844,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.172073841094971,
"rewards/margins": 10.490161895751953,
"rewards/rejected": -4.318088531494141,
"step": 5260
},
{
"epoch": 1.8607494547617474,
"grad_norm": 3.952148287689068,
"learning_rate": 6.011154184963092e-08,
"logits/chosen": -0.7919384241104126,
"logits/rejected": -0.23931005597114563,
"logps/chosen": -105.4823226928711,
"logps/rejected": -468.26806640625,
"loss": 0.7644,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.060011863708496,
"rewards/margins": 10.377952575683594,
"rewards/rejected": -4.317941188812256,
"step": 5280
},
{
"epoch": 1.8677989998457911,
"grad_norm": 3.748301603574638,
"learning_rate": 5.422896483718077e-08,
"logits/chosen": -0.796483039855957,
"logits/rejected": -0.24163733422756195,
"logps/chosen": -108.889892578125,
"logps/rejected": -468.3476257324219,
"loss": 0.786,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.064844608306885,
"rewards/margins": 10.383631706237793,
"rewards/rejected": -4.318787574768066,
"step": 5300
},
{
"epoch": 1.874848544929835,
"grad_norm": 3.7690566013848623,
"learning_rate": 4.864608753680861e-08,
"logits/chosen": -0.7904632091522217,
"logits/rejected": -0.23976294696331024,
"logps/chosen": -109.89836883544922,
"logps/rejected": -468.2937927246094,
"loss": 0.7855,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.115835666656494,
"rewards/margins": 10.434123039245605,
"rewards/rejected": -4.318286895751953,
"step": 5320
},
{
"epoch": 1.8818980900138786,
"grad_norm": 3.7426789606880466,
"learning_rate": 4.3363594062724444e-08,
"logits/chosen": -0.7964286804199219,
"logits/rejected": -0.2432793378829956,
"logps/chosen": -112.6116714477539,
"logps/rejected": -468.3102722167969,
"loss": 0.7944,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.161302089691162,
"rewards/margins": 10.479842185974121,
"rewards/rejected": -4.318540096282959,
"step": 5340
},
{
"epoch": 1.8889476350979226,
"grad_norm": 3.7547604537404653,
"learning_rate": 3.838213172072669e-08,
"logits/chosen": -0.7900466322898865,
"logits/rejected": -0.24007217586040497,
"logps/chosen": -112.05464172363281,
"logps/rejected": -468.3132019042969,
"loss": 0.7994,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.090831756591797,
"rewards/margins": 10.40927791595459,
"rewards/rejected": -4.318446636199951,
"step": 5360
},
{
"epoch": 1.8959971801819664,
"grad_norm": 3.8913448082346416,
"learning_rate": 3.370231092888365e-08,
"logits/chosen": -0.7889196872711182,
"logits/rejected": -0.23926615715026855,
"logps/chosen": -113.01226043701172,
"logps/rejected": -468.3741760253906,
"loss": 0.8007,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.118028163909912,
"rewards/margins": 10.437150955200195,
"rewards/rejected": -4.319122314453125,
"step": 5380
},
{
"epoch": 1.9030467252660102,
"grad_norm": 3.9082849434048708,
"learning_rate": 2.9324705142732557e-08,
"logits/chosen": -0.7887745499610901,
"logits/rejected": -0.24104784429073334,
"logps/chosen": -108.8088150024414,
"logps/rejected": -468.35888671875,
"loss": 0.7816,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0397725105285645,
"rewards/margins": 10.358664512634277,
"rewards/rejected": -4.318892002105713,
"step": 5400
},
{
"epoch": 1.9100962703500541,
"grad_norm": 3.774912938041044,
"learning_rate": 2.5249850785010743e-08,
"logits/chosen": -0.7838408350944519,
"logits/rejected": -0.24242563545703888,
"logps/chosen": -109.24266815185547,
"logps/rejected": -468.3182067871094,
"loss": 0.7807,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.064825534820557,
"rewards/margins": 10.383365631103516,
"rewards/rejected": -4.318539619445801,
"step": 5420
},
{
"epoch": 1.9171458154340977,
"grad_norm": 3.690894820094116,
"learning_rate": 2.1478247179923527e-08,
"logits/chosen": -0.7918921113014221,
"logits/rejected": -0.24019639194011688,
"logps/chosen": -108.5796890258789,
"logps/rejected": -468.2789611816406,
"loss": 0.7817,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.0598978996276855,
"rewards/margins": 10.377920150756836,
"rewards/rejected": -4.318021297454834,
"step": 5440
},
{
"epoch": 1.9241953605181417,
"grad_norm": 3.8565463435143443,
"learning_rate": 1.8010356491957038e-08,
"logits/chosen": -0.7949087023735046,
"logits/rejected": -0.2375962734222412,
"logps/chosen": -105.0801773071289,
"logps/rejected": -468.4278259277344,
"loss": 0.7635,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9956889152526855,
"rewards/margins": 10.31513500213623,
"rewards/rejected": -4.319445610046387,
"step": 5460
},
{
"epoch": 1.9312449056021852,
"grad_norm": 3.7684906309960877,
"learning_rate": 1.484660366924684e-08,
"logits/chosen": -0.794578492641449,
"logits/rejected": -0.23898427188396454,
"logps/chosen": -109.71996307373047,
"logps/rejected": -468.4088439941406,
"loss": 0.7866,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.100671291351318,
"rewards/margins": 10.420153617858887,
"rewards/rejected": -4.319482326507568,
"step": 5480
},
{
"epoch": 1.9382944506862292,
"grad_norm": 3.719249551184422,
"learning_rate": 1.1987376391504601e-08,
"logits/chosen": -0.7956030964851379,
"logits/rejected": -0.240992933511734,
"logps/chosen": -108.1368408203125,
"logps/rejected": -468.4326171875,
"loss": 0.7728,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.102418422698975,
"rewards/margins": 10.422101020812988,
"rewards/rejected": -4.319683074951172,
"step": 5500
},
{
"epoch": 1.945343995770273,
"grad_norm": 3.6257082177562685,
"learning_rate": 9.433025022513309e-09,
"logits/chosen": -0.8015807271003723,
"logits/rejected": -0.24043157696723938,
"logps/chosen": -108.60029602050781,
"logps/rejected": -468.419921875,
"loss": 0.7834,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.090754985809326,
"rewards/margins": 10.410210609436035,
"rewards/rejected": -4.319455146789551,
"step": 5520
},
{
"epoch": 1.9523935408543167,
"grad_norm": 3.5711504630557824,
"learning_rate": 7.183862567194111e-09,
"logits/chosen": -0.792350709438324,
"logits/rejected": -0.23879127204418182,
"logps/chosen": -107.3970718383789,
"logps/rejected": -468.49072265625,
"loss": 0.7745,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.004356384277344,
"rewards/margins": 10.32451057434082,
"rewards/rejected": -4.320154190063477,
"step": 5540
},
{
"epoch": 1.9594430859383607,
"grad_norm": 3.9728705486510756,
"learning_rate": 5.2401646332508884e-09,
"logits/chosen": -0.7877883315086365,
"logits/rejected": -0.24027414619922638,
"logps/chosen": -108.71870422363281,
"logps/rejected": -468.4919128417969,
"loss": 0.7832,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.043579578399658,
"rewards/margins": 10.363776206970215,
"rewards/rejected": -4.320196628570557,
"step": 5560
},
{
"epoch": 1.9664926310224042,
"grad_norm": 3.826376351654586,
"learning_rate": 3.6021693973992135e-09,
"logits/chosen": -0.7913434505462646,
"logits/rejected": -0.23852479457855225,
"logps/chosen": -110.75666809082031,
"logps/rejected": -468.4978942871094,
"loss": 0.7869,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.171904563903809,
"rewards/margins": 10.492271423339844,
"rewards/rejected": -4.320366382598877,
"step": 5580
},
{
"epoch": 1.9735421761064482,
"grad_norm": 3.5362233127077127,
"learning_rate": 2.2700775761791416e-09,
"logits/chosen": -0.7925167083740234,
"logits/rejected": -0.23842616379261017,
"logps/chosen": -113.69185638427734,
"logps/rejected": -468.528564453125,
"loss": 0.8076,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.163191318511963,
"rewards/margins": 10.483903884887695,
"rewards/rejected": -4.320712566375732,
"step": 5600
},
{
"epoch": 1.9805917211904918,
"grad_norm": 4.004084496355029,
"learning_rate": 1.2440524013607181e-09,
"logits/chosen": -0.7958860397338867,
"logits/rejected": -0.23961055278778076,
"logps/chosen": -114.5505599975586,
"logps/rejected": -468.49609375,
"loss": 0.8084,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.199494361877441,
"rewards/margins": 10.519908905029297,
"rewards/rejected": -4.320415019989014,
"step": 5620
},
{
"epoch": 1.9876412662745357,
"grad_norm": 3.907140951530193,
"learning_rate": 5.242195999421995e-10,
"logits/chosen": -0.8031083941459656,
"logits/rejected": -0.2381582111120224,
"logps/chosen": -110.63336181640625,
"logps/rejected": -468.484375,
"loss": 0.7892,
"rewards/accuracies": 1.0,
"rewards/chosen": 6.1121954917907715,
"rewards/margins": 10.432402610778809,
"rewards/rejected": -4.320207118988037,
"step": 5640
},
{
"epoch": 1.9946908113585795,
"grad_norm": 3.7397525668561244,
"learning_rate": 1.1066737874210199e-10,
"logits/chosen": -0.7896060943603516,
"logits/rejected": -0.23894786834716797,
"logps/chosen": -103.44597625732422,
"logps/rejected": -468.4378967285156,
"loss": 0.746,
"rewards/accuracies": 1.0,
"rewards/chosen": 5.9833083152771,
"rewards/margins": 10.302966117858887,
"rewards/rejected": -4.319657802581787,
"step": 5660
},
{
"epoch": 2.0,
"step": 5676,
"total_flos": 60766247190528.0,
"train_loss": 0.8989143536240387,
"train_runtime": 34286.54,
"train_samples_per_second": 5.296,
"train_steps_per_second": 0.166
}
],
"logging_steps": 20,
"max_steps": 5676,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 60766247190528.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}