dpo_longstep_3_4_4 / trainer_state.json
derko83's picture
Upload folder using huggingface_hub
a78a799 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 5972,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.033489618218352314,
"grad_norm": 39.56058883666992,
"learning_rate": 4.375e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.2135009765625,
"logps/rejected": -218.9482421875,
"loss": 0.6927,
"rewards/accuracies": 0.41499999165534973,
"rewards/chosen": 0.0009201900684274733,
"rewards/margins": 0.0012563117779791355,
"rewards/rejected": -0.00033612194238230586,
"step": 50
},
{
"epoch": 0.06697923643670463,
"grad_norm": 55.64208984375,
"learning_rate": 8.839285714285714e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.50753784179688,
"logps/rejected": -224.6807403564453,
"loss": 0.6932,
"rewards/accuracies": 0.4087499976158142,
"rewards/chosen": -0.0016289422055706382,
"rewards/margins": 0.0002692897687666118,
"rewards/rejected": -0.001898231916129589,
"step": 100
},
{
"epoch": 0.10046885465505694,
"grad_norm": 48.16231918334961,
"learning_rate": 1.3303571428571427e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -164.7879638671875,
"logps/rejected": -219.03224182128906,
"loss": 0.6925,
"rewards/accuracies": 0.4325000047683716,
"rewards/chosen": -0.0006416282267309725,
"rewards/margins": 0.0016548261046409607,
"rewards/rejected": -0.0022964540403336287,
"step": 150
},
{
"epoch": 0.13395847287340926,
"grad_norm": 45.87161636352539,
"learning_rate": 1.776785714285714e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.67550659179688,
"logps/rejected": -221.42312622070312,
"loss": 0.6933,
"rewards/accuracies": 0.4050000011920929,
"rewards/chosen": -0.003886653808876872,
"rewards/margins": 8.521832205587998e-05,
"rewards/rejected": -0.003971872851252556,
"step": 200
},
{
"epoch": 0.16744809109176156,
"grad_norm": 34.68981170654297,
"learning_rate": 2.2232142857142856e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.92369079589844,
"logps/rejected": -222.72586059570312,
"loss": 0.6904,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.004237725865095854,
"rewards/margins": 0.005978057160973549,
"rewards/rejected": -0.010215784423053265,
"step": 250
},
{
"epoch": 0.20093770931011387,
"grad_norm": 45.68313217163086,
"learning_rate": 2.669642857142857e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -176.19508361816406,
"logps/rejected": -220.88339233398438,
"loss": 0.6866,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.010833066888153553,
"rewards/margins": 0.014277225360274315,
"rewards/rejected": -0.025110295042395592,
"step": 300
},
{
"epoch": 0.23442732752846618,
"grad_norm": 45.355247497558594,
"learning_rate": 3.1160714285714285e-07,
"logits/chosen": NaN,
"logits/rejected": -1.2874314785003662,
"logps/chosen": -172.1802520751953,
"logps/rejected": -222.34373474121094,
"loss": 0.6817,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.022201180458068848,
"rewards/margins": 0.024774856865406036,
"rewards/rejected": -0.04697604104876518,
"step": 350
},
{
"epoch": 0.2679169457468185,
"grad_norm": 44.401466369628906,
"learning_rate": 3.5625e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -171.78598022460938,
"logps/rejected": -225.00076293945312,
"loss": 0.6694,
"rewards/accuracies": 0.5525000095367432,
"rewards/chosen": -0.04056182876229286,
"rewards/margins": 0.05375281721353531,
"rewards/rejected": -0.09431464225053787,
"step": 400
},
{
"epoch": 0.3014065639651708,
"grad_norm": 42.0828742980957,
"learning_rate": 4.008928571428571e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.93777465820312,
"logps/rejected": -225.8251953125,
"loss": 0.6615,
"rewards/accuracies": 0.5262500047683716,
"rewards/chosen": -0.08230926841497421,
"rewards/margins": 0.07786127924919128,
"rewards/rejected": -0.1601705402135849,
"step": 450
},
{
"epoch": 0.33489618218352313,
"grad_norm": 37.493553161621094,
"learning_rate": 4.455357142857143e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.60545349121094,
"logps/rejected": -223.0897674560547,
"loss": 0.6547,
"rewards/accuracies": 0.5099999904632568,
"rewards/chosen": -0.12488727271556854,
"rewards/margins": 0.10414745658636093,
"rewards/rejected": -0.22903470695018768,
"step": 500
},
{
"epoch": 0.3683858004018754,
"grad_norm": 42.67152404785156,
"learning_rate": 4.901785714285714e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -173.47787475585938,
"logps/rejected": -232.50518798828125,
"loss": 0.631,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.15443742275238037,
"rewards/margins": 0.18413911759853363,
"rewards/rejected": -0.3385765552520752,
"step": 550
},
{
"epoch": 0.40187541862022774,
"grad_norm": 32.57563018798828,
"learning_rate": 5.348214285714285e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -181.27125549316406,
"logps/rejected": -241.04226684570312,
"loss": 0.6242,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.2177175134420395,
"rewards/margins": 0.22946205735206604,
"rewards/rejected": -0.4471796154975891,
"step": 600
},
{
"epoch": 0.43536503683858,
"grad_norm": 47.70533752441406,
"learning_rate": 5.794642857142857e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.95242309570312,
"logps/rejected": -240.94540405273438,
"loss": 0.6072,
"rewards/accuracies": 0.5475000143051147,
"rewards/chosen": -0.31218427419662476,
"rewards/margins": 0.3060773015022278,
"rewards/rejected": -0.6182616353034973,
"step": 650
},
{
"epoch": 0.46885465505693236,
"grad_norm": 55.132423400878906,
"learning_rate": 6.241071428571429e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.1224365234375,
"logps/rejected": -239.97381591796875,
"loss": 0.598,
"rewards/accuracies": 0.5475000143051147,
"rewards/chosen": -0.38468390703201294,
"rewards/margins": 0.3886369466781616,
"rewards/rejected": -0.7733209133148193,
"step": 700
},
{
"epoch": 0.5023442732752846,
"grad_norm": 35.69628143310547,
"learning_rate": 6.6875e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -181.9203643798828,
"logps/rejected": -238.3597412109375,
"loss": 0.6054,
"rewards/accuracies": 0.5350000262260437,
"rewards/chosen": -0.4038671851158142,
"rewards/margins": 0.38458195328712463,
"rewards/rejected": -0.7884491086006165,
"step": 750
},
{
"epoch": 0.535833891493637,
"grad_norm": 52.34265899658203,
"learning_rate": 7.133928571428571e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -173.30198669433594,
"logps/rejected": -238.93728637695312,
"loss": 0.5574,
"rewards/accuracies": 0.5899999737739563,
"rewards/chosen": -0.4324275553226471,
"rewards/margins": 0.5758498907089233,
"rewards/rejected": -1.008277416229248,
"step": 800
},
{
"epoch": 0.5693235097119893,
"grad_norm": 50.31780242919922,
"learning_rate": 7.580357142857143e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -177.4162139892578,
"logps/rejected": -239.99441528320312,
"loss": 0.5693,
"rewards/accuracies": 0.5762500166893005,
"rewards/chosen": -0.4396001696586609,
"rewards/margins": 0.5518670678138733,
"rewards/rejected": -0.991467297077179,
"step": 850
},
{
"epoch": 0.6028131279303416,
"grad_norm": 35.81449508666992,
"learning_rate": 7.995271867612292e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.32102966308594,
"logps/rejected": -241.9414520263672,
"loss": 0.5773,
"rewards/accuracies": 0.5774999856948853,
"rewards/chosen": -0.5533062219619751,
"rewards/margins": 0.5976377129554749,
"rewards/rejected": -1.1509439945220947,
"step": 900
},
{
"epoch": 0.6363027461486939,
"grad_norm": 37.500022888183594,
"learning_rate": 7.916469661150512e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.38853454589844,
"logps/rejected": -238.58958435058594,
"loss": 0.5608,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.5429355502128601,
"rewards/margins": 0.6483522057533264,
"rewards/rejected": -1.1912877559661865,
"step": 950
},
{
"epoch": 0.6697923643670463,
"grad_norm": 45.89781188964844,
"learning_rate": 7.837667454688732e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.39967346191406,
"logps/rejected": -242.6263427734375,
"loss": 0.5456,
"rewards/accuracies": 0.5824999809265137,
"rewards/chosen": -0.546024739742279,
"rewards/margins": 0.7407156229019165,
"rewards/rejected": -1.2867404222488403,
"step": 1000
},
{
"epoch": 0.7032819825853985,
"grad_norm": 30.887229919433594,
"learning_rate": 7.75886524822695e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -180.49740600585938,
"logps/rejected": -256.4322509765625,
"loss": 0.5166,
"rewards/accuracies": 0.6274999976158142,
"rewards/chosen": -0.6010158658027649,
"rewards/margins": 0.8881167769432068,
"rewards/rejected": -1.4891326427459717,
"step": 1050
},
{
"epoch": 0.7367716008037508,
"grad_norm": 54.525856018066406,
"learning_rate": 7.680063041765169e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -169.29653930664062,
"logps/rejected": -241.8204345703125,
"loss": 0.5213,
"rewards/accuracies": 0.5849999785423279,
"rewards/chosen": -0.6621356010437012,
"rewards/margins": 0.8917463421821594,
"rewards/rejected": -1.553882122039795,
"step": 1100
},
{
"epoch": 0.7702612190221031,
"grad_norm": 37.80088424682617,
"learning_rate": 7.601260835303388e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.52398681640625,
"logps/rejected": -246.53189086914062,
"loss": 0.5385,
"rewards/accuracies": 0.5950000286102295,
"rewards/chosen": -0.6485376954078674,
"rewards/margins": 0.8500573039054871,
"rewards/rejected": -1.4985949993133545,
"step": 1150
},
{
"epoch": 0.8037508372404555,
"grad_norm": 38.089324951171875,
"learning_rate": 7.522458628841607e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.8751220703125,
"logps/rejected": -272.0120849609375,
"loss": 0.5175,
"rewards/accuracies": 0.6162499785423279,
"rewards/chosen": -0.6311337351799011,
"rewards/margins": 0.9966024160385132,
"rewards/rejected": -1.627736210823059,
"step": 1200
},
{
"epoch": 0.8372404554588078,
"grad_norm": 39.7374153137207,
"learning_rate": 7.443656422379827e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.6376190185547,
"logps/rejected": -257.7156982421875,
"loss": 0.5304,
"rewards/accuracies": 0.5975000262260437,
"rewards/chosen": -0.7539389133453369,
"rewards/margins": 1.0195887088775635,
"rewards/rejected": -1.7735275030136108,
"step": 1250
},
{
"epoch": 0.87073007367716,
"grad_norm": 30.075101852416992,
"learning_rate": 7.364854215918045e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.0233154296875,
"logps/rejected": -246.95262145996094,
"loss": 0.5185,
"rewards/accuracies": 0.6162499785423279,
"rewards/chosen": -0.6855795383453369,
"rewards/margins": 0.9803519248962402,
"rewards/rejected": -1.6659313440322876,
"step": 1300
},
{
"epoch": 0.9042196918955124,
"grad_norm": 39.85395431518555,
"learning_rate": 7.286052009456264e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.7262725830078,
"logps/rejected": -253.09869384765625,
"loss": 0.5237,
"rewards/accuracies": 0.6012499928474426,
"rewards/chosen": -0.606716513633728,
"rewards/margins": 0.8856968879699707,
"rewards/rejected": -1.4924132823944092,
"step": 1350
},
{
"epoch": 0.9377093101138647,
"grad_norm": 66.29072570800781,
"learning_rate": 7.207249802994484e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.25836181640625,
"logps/rejected": -241.17901611328125,
"loss": 0.5323,
"rewards/accuracies": 0.6012499928474426,
"rewards/chosen": -0.6343129873275757,
"rewards/margins": 0.9016135931015015,
"rewards/rejected": -1.5359266996383667,
"step": 1400
},
{
"epoch": 0.971198928332217,
"grad_norm": 36.76164245605469,
"learning_rate": 7.128447596532703e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.4203643798828,
"logps/rejected": -249.50076293945312,
"loss": 0.5378,
"rewards/accuracies": 0.5849999785423279,
"rewards/chosen": -0.5479399561882019,
"rewards/margins": 0.8743146657943726,
"rewards/rejected": -1.4222546815872192,
"step": 1450
},
{
"epoch": 1.0046885465505693,
"grad_norm": 53.488407135009766,
"learning_rate": 7.049645390070921e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.12411499023438,
"logps/rejected": -253.10464477539062,
"loss": 0.4944,
"rewards/accuracies": 0.6324999928474426,
"rewards/chosen": -0.5486608147621155,
"rewards/margins": 1.0833656787872314,
"rewards/rejected": -1.6320266723632812,
"step": 1500
},
{
"epoch": 1.0381781647689217,
"grad_norm": 49.80898666381836,
"learning_rate": 6.97084318360914e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.8605499267578,
"logps/rejected": -265.5860595703125,
"loss": 0.4357,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6526676416397095,
"rewards/margins": 1.3229660987854004,
"rewards/rejected": -1.9756335020065308,
"step": 1550
},
{
"epoch": 1.0716677829872738,
"grad_norm": 39.074790954589844,
"learning_rate": 6.89204097714736e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -185.23265075683594,
"logps/rejected": -257.21270751953125,
"loss": 0.455,
"rewards/accuracies": 0.6762499809265137,
"rewards/chosen": -0.6439327001571655,
"rewards/margins": 1.2369264364242554,
"rewards/rejected": -1.8808592557907104,
"step": 1600
},
{
"epoch": 1.1051574012056262,
"grad_norm": 24.9747257232666,
"learning_rate": 6.813238770685579e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -193.9910125732422,
"logps/rejected": -257.0543518066406,
"loss": 0.4256,
"rewards/accuracies": 0.6912500262260437,
"rewards/chosen": -0.5875076055526733,
"rewards/margins": 1.313684344291687,
"rewards/rejected": -1.9011921882629395,
"step": 1650
},
{
"epoch": 1.1386470194239786,
"grad_norm": 27.623506546020508,
"learning_rate": 6.734436564223798e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.95932006835938,
"logps/rejected": -246.83566284179688,
"loss": 0.451,
"rewards/accuracies": 0.6650000214576721,
"rewards/chosen": -0.7364577651023865,
"rewards/margins": 1.2061336040496826,
"rewards/rejected": -1.9425911903381348,
"step": 1700
},
{
"epoch": 1.1721366376423308,
"grad_norm": 39.15848922729492,
"learning_rate": 6.655634357762017e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.35861206054688,
"logps/rejected": -249.7042236328125,
"loss": 0.4414,
"rewards/accuracies": 0.6700000166893005,
"rewards/chosen": -0.6322548985481262,
"rewards/margins": 1.4189176559448242,
"rewards/rejected": -2.0511724948883057,
"step": 1750
},
{
"epoch": 1.2056262558606832,
"grad_norm": 46.19662857055664,
"learning_rate": 6.576832151300236e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.02719116210938,
"logps/rejected": -266.9911193847656,
"loss": 0.4244,
"rewards/accuracies": 0.7012500166893005,
"rewards/chosen": -0.6425164937973022,
"rewards/margins": 1.4480139017105103,
"rewards/rejected": -2.0905306339263916,
"step": 1800
},
{
"epoch": 1.2391158740790356,
"grad_norm": 25.113298416137695,
"learning_rate": 6.498029944838455e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -178.42767333984375,
"logps/rejected": -260.0691223144531,
"loss": 0.4309,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.6952015161514282,
"rewards/margins": 1.4248483180999756,
"rewards/rejected": -2.1200499534606934,
"step": 1850
},
{
"epoch": 1.2726054922973877,
"grad_norm": 44.80360412597656,
"learning_rate": 6.419227738376675e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.84951782226562,
"logps/rejected": -257.0547790527344,
"loss": 0.4364,
"rewards/accuracies": 0.6762499809265137,
"rewards/chosen": -0.8910938501358032,
"rewards/margins": 1.557470679283142,
"rewards/rejected": -2.4485647678375244,
"step": 1900
},
{
"epoch": 1.3060951105157401,
"grad_norm": 28.531023025512695,
"learning_rate": 6.340425531914892e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -185.15121459960938,
"logps/rejected": -261.5601501464844,
"loss": 0.4392,
"rewards/accuracies": 0.6725000143051147,
"rewards/chosen": -0.7338109612464905,
"rewards/margins": 1.4515758752822876,
"rewards/rejected": -2.185386896133423,
"step": 1950
},
{
"epoch": 1.3395847287340925,
"grad_norm": 28.397706985473633,
"learning_rate": 6.261623325453112e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.82705688476562,
"logps/rejected": -271.1513366699219,
"loss": 0.4268,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": -0.7382247447967529,
"rewards/margins": 1.5511670112609863,
"rewards/rejected": -2.2893919944763184,
"step": 2000
},
{
"epoch": 1.3730743469524447,
"grad_norm": 35.965423583984375,
"learning_rate": 6.182821118991332e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.85757446289062,
"logps/rejected": -273.0481262207031,
"loss": 0.4352,
"rewards/accuracies": 0.6850000023841858,
"rewards/chosen": -0.8608375787734985,
"rewards/margins": 1.4874813556671143,
"rewards/rejected": -2.3483190536499023,
"step": 2050
},
{
"epoch": 1.406563965170797,
"grad_norm": 38.594947814941406,
"learning_rate": 6.10401891252955e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.38641357421875,
"logps/rejected": -278.737548828125,
"loss": 0.4166,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7014285922050476,
"rewards/margins": 1.5582598447799683,
"rewards/rejected": -2.259688377380371,
"step": 2100
},
{
"epoch": 1.4400535833891492,
"grad_norm": 35.8038215637207,
"learning_rate": 6.025216706067769e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.76324462890625,
"logps/rejected": -275.5191650390625,
"loss": 0.404,
"rewards/accuracies": 0.7087500095367432,
"rewards/chosen": -0.6745861172676086,
"rewards/margins": 1.6506869792938232,
"rewards/rejected": -2.325273036956787,
"step": 2150
},
{
"epoch": 1.4735432016075016,
"grad_norm": 29.62619972229004,
"learning_rate": 5.946414499605989e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.25250244140625,
"logps/rejected": -272.7938232421875,
"loss": 0.398,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.6346092224121094,
"rewards/margins": 1.7176785469055176,
"rewards/rejected": -2.352287769317627,
"step": 2200
},
{
"epoch": 1.507032819825854,
"grad_norm": 21.93035316467285,
"learning_rate": 5.867612293144208e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.32916259765625,
"logps/rejected": -260.4537048339844,
"loss": 0.4424,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.6742202639579773,
"rewards/margins": 1.4148352146148682,
"rewards/rejected": -2.0890555381774902,
"step": 2250
},
{
"epoch": 1.5405224380442064,
"grad_norm": 15.887839317321777,
"learning_rate": 5.788810086682427e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.4200439453125,
"logps/rejected": -274.1068420410156,
"loss": 0.4002,
"rewards/accuracies": 0.6837499737739563,
"rewards/chosen": -0.6341544985771179,
"rewards/margins": 1.6712123155593872,
"rewards/rejected": -2.3053667545318604,
"step": 2300
},
{
"epoch": 1.5740120562625586,
"grad_norm": 46.13706588745117,
"learning_rate": 5.710007880220646e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.80099487304688,
"logps/rejected": -268.6807861328125,
"loss": 0.4078,
"rewards/accuracies": 0.6899999976158142,
"rewards/chosen": -0.7255478501319885,
"rewards/margins": 1.6650432348251343,
"rewards/rejected": -2.3905911445617676,
"step": 2350
},
{
"epoch": 1.607501674480911,
"grad_norm": 41.01249313354492,
"learning_rate": 5.631205673758865e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -189.23605346679688,
"logps/rejected": -268.6483154296875,
"loss": 0.4133,
"rewards/accuracies": 0.6787499785423279,
"rewards/chosen": -0.8096724152565002,
"rewards/margins": 1.5884754657745361,
"rewards/rejected": -2.3981478214263916,
"step": 2400
},
{
"epoch": 1.6409912926992631,
"grad_norm": 44.18216323852539,
"learning_rate": 5.552403467297084e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -206.16220092773438,
"logps/rejected": -275.8337097167969,
"loss": 0.42,
"rewards/accuracies": 0.6887500286102295,
"rewards/chosen": -0.776029109954834,
"rewards/margins": 1.5833215713500977,
"rewards/rejected": -2.3593506813049316,
"step": 2450
},
{
"epoch": 1.6744809109176155,
"grad_norm": 38.77958679199219,
"learning_rate": 5.473601260835303e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.59031677246094,
"logps/rejected": -268.516357421875,
"loss": 0.4145,
"rewards/accuracies": 0.6850000023841858,
"rewards/chosen": -0.6448932886123657,
"rewards/margins": 1.565537691116333,
"rewards/rejected": -2.210430860519409,
"step": 2500
},
{
"epoch": 1.707970529135968,
"grad_norm": 36.07415008544922,
"learning_rate": 5.394799054373523e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -183.8679656982422,
"logps/rejected": -262.7795104980469,
"loss": 0.3909,
"rewards/accuracies": 0.7037500143051147,
"rewards/chosen": -0.6056129336357117,
"rewards/margins": 1.6474745273590088,
"rewards/rejected": -2.2530875205993652,
"step": 2550
},
{
"epoch": 1.7414601473543203,
"grad_norm": 59.49274826049805,
"learning_rate": 5.315996847911741e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.5425567626953,
"logps/rejected": -271.7103576660156,
"loss": 0.3796,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.7344449758529663,
"rewards/margins": 1.7059468030929565,
"rewards/rejected": -2.440391778945923,
"step": 2600
},
{
"epoch": 1.7749497655726725,
"grad_norm": 41.25373077392578,
"learning_rate": 5.23719464144996e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.00929260253906,
"logps/rejected": -268.0653076171875,
"loss": 0.3793,
"rewards/accuracies": 0.7099999785423279,
"rewards/chosen": -0.7967619895935059,
"rewards/margins": 1.9842884540557861,
"rewards/rejected": -2.781050443649292,
"step": 2650
},
{
"epoch": 1.8084393837910246,
"grad_norm": 37.203304290771484,
"learning_rate": 5.15839243498818e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.30181884765625,
"logps/rejected": -272.9234313964844,
"loss": 0.4057,
"rewards/accuracies": 0.6825000047683716,
"rewards/chosen": -0.7267603874206543,
"rewards/margins": 1.7311309576034546,
"rewards/rejected": -2.4578914642333984,
"step": 2700
},
{
"epoch": 1.841929002009377,
"grad_norm": 30.321313858032227,
"learning_rate": 5.079590228526398e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -189.3111114501953,
"logps/rejected": -270.3074645996094,
"loss": 0.4081,
"rewards/accuracies": 0.6800000071525574,
"rewards/chosen": -0.751646101474762,
"rewards/margins": 1.640588641166687,
"rewards/rejected": -2.3922348022460938,
"step": 2750
},
{
"epoch": 1.8754186202277294,
"grad_norm": 47.42466735839844,
"learning_rate": 5.000788022064617e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -179.08119201660156,
"logps/rejected": -277.5658264160156,
"loss": 0.3837,
"rewards/accuracies": 0.6949999928474426,
"rewards/chosen": -0.6461160182952881,
"rewards/margins": 1.8302369117736816,
"rewards/rejected": -2.476353168487549,
"step": 2800
},
{
"epoch": 1.9089082384460818,
"grad_norm": 37.78199005126953,
"learning_rate": 4.921985815602837e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.4869384765625,
"logps/rejected": -276.9580383300781,
"loss": 0.3995,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7339184284210205,
"rewards/margins": 1.7630858421325684,
"rewards/rejected": -2.4970040321350098,
"step": 2850
},
{
"epoch": 1.942397856664434,
"grad_norm": 42.2109375,
"learning_rate": 4.843183609141055e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -199.12640380859375,
"logps/rejected": -275.89630126953125,
"loss": 0.4105,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6928785443305969,
"rewards/margins": 1.7767040729522705,
"rewards/rejected": -2.4695825576782227,
"step": 2900
},
{
"epoch": 1.9758874748827864,
"grad_norm": 50.837059020996094,
"learning_rate": 4.764381402679275e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.2418212890625,
"logps/rejected": -285.7373046875,
"loss": 0.3963,
"rewards/accuracies": 0.7012500166893005,
"rewards/chosen": -0.7950295805931091,
"rewards/margins": 1.9002233743667603,
"rewards/rejected": -2.6952526569366455,
"step": 2950
},
{
"epoch": 2.0093770931011385,
"grad_norm": 24.105684280395508,
"learning_rate": 4.685579196217494e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -175.44351196289062,
"logps/rejected": -268.713134765625,
"loss": 0.3725,
"rewards/accuracies": 0.7087500095367432,
"rewards/chosen": -0.7846677303314209,
"rewards/margins": 1.9885753393173218,
"rewards/rejected": -2.7732431888580322,
"step": 3000
},
{
"epoch": 2.042866711319491,
"grad_norm": 21.2971248626709,
"learning_rate": 4.606776989755713e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -189.33584594726562,
"logps/rejected": -282.147705078125,
"loss": 0.3284,
"rewards/accuracies": 0.7512500286102295,
"rewards/chosen": -0.6468074917793274,
"rewards/margins": 2.03031849861145,
"rewards/rejected": -2.677126169204712,
"step": 3050
},
{
"epoch": 2.0763563295378433,
"grad_norm": 13.780171394348145,
"learning_rate": 4.527974783293932e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.3354949951172,
"logps/rejected": -282.95489501953125,
"loss": 0.3183,
"rewards/accuracies": 0.7549999952316284,
"rewards/chosen": -0.7194473147392273,
"rewards/margins": 2.1648471355438232,
"rewards/rejected": -2.884294033050537,
"step": 3100
},
{
"epoch": 2.1098459477561957,
"grad_norm": 27.040647506713867,
"learning_rate": 4.449172576832151e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.97789001464844,
"logps/rejected": -267.7035217285156,
"loss": 0.3298,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.7726535797119141,
"rewards/margins": 2.0831313133239746,
"rewards/rejected": -2.8557848930358887,
"step": 3150
},
{
"epoch": 2.1433355659745477,
"grad_norm": 52.36626052856445,
"learning_rate": 4.37037037037037e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.05853271484375,
"logps/rejected": -278.2367248535156,
"loss": 0.3381,
"rewards/accuracies": 0.7287499904632568,
"rewards/chosen": -0.7423791289329529,
"rewards/margins": 2.1862621307373047,
"rewards/rejected": -2.9286410808563232,
"step": 3200
},
{
"epoch": 2.1768251841929,
"grad_norm": 44.21054458618164,
"learning_rate": 4.2915681639085896e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -194.27188110351562,
"logps/rejected": -286.78887939453125,
"loss": 0.3391,
"rewards/accuracies": 0.7212499976158142,
"rewards/chosen": -0.7143966555595398,
"rewards/margins": 2.098987579345703,
"rewards/rejected": -2.8133840560913086,
"step": 3250
},
{
"epoch": 2.2103148024112524,
"grad_norm": 55.730262756347656,
"learning_rate": 4.212765957446808e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.7841796875,
"logps/rejected": -289.6191101074219,
"loss": 0.3417,
"rewards/accuracies": 0.7174999713897705,
"rewards/chosen": -0.747239351272583,
"rewards/margins": 2.2462081909179688,
"rewards/rejected": -2.993447780609131,
"step": 3300
},
{
"epoch": 2.243804420629605,
"grad_norm": 34.05455017089844,
"learning_rate": 4.1339637509850275e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.21963500976562,
"logps/rejected": -282.9302062988281,
"loss": 0.3288,
"rewards/accuracies": 0.7275000214576721,
"rewards/chosen": -0.8579057455062866,
"rewards/margins": 2.2049171924591064,
"rewards/rejected": -3.0628225803375244,
"step": 3350
},
{
"epoch": 2.2772940388479572,
"grad_norm": 32.748878479003906,
"learning_rate": 4.0551615445232467e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -194.06427001953125,
"logps/rejected": -288.862548828125,
"loss": 0.3154,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9077091217041016,
"rewards/margins": 2.30654239654541,
"rewards/rejected": -3.2142515182495117,
"step": 3400
},
{
"epoch": 2.3107836570663096,
"grad_norm": 26.387239456176758,
"learning_rate": 3.976359338061466e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.67759704589844,
"logps/rejected": -284.309326171875,
"loss": 0.3273,
"rewards/accuracies": 0.7287499904632568,
"rewards/chosen": -0.9315968155860901,
"rewards/margins": 2.26393985748291,
"rewards/rejected": -3.1955366134643555,
"step": 3450
},
{
"epoch": 2.3442732752846616,
"grad_norm": 26.215883255004883,
"learning_rate": 3.8975571315996845e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -193.22149658203125,
"logps/rejected": -286.2984313964844,
"loss": 0.3245,
"rewards/accuracies": 0.7200000286102295,
"rewards/chosen": -0.839231014251709,
"rewards/margins": 2.3949248790740967,
"rewards/rejected": -3.2341556549072266,
"step": 3500
},
{
"epoch": 2.377762893503014,
"grad_norm": 45.05733108520508,
"learning_rate": 3.8187549251379037e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.80043029785156,
"logps/rejected": -286.0914306640625,
"loss": 0.3323,
"rewards/accuracies": 0.7162500023841858,
"rewards/chosen": -0.8909017443656921,
"rewards/margins": 2.335562229156494,
"rewards/rejected": -3.226464033126831,
"step": 3550
},
{
"epoch": 2.4112525117213663,
"grad_norm": 23.256881713867188,
"learning_rate": 3.739952718676123e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -197.46514892578125,
"logps/rejected": -286.53509521484375,
"loss": 0.3074,
"rewards/accuracies": 0.7787500023841858,
"rewards/chosen": -0.8636592030525208,
"rewards/margins": 2.3110320568084717,
"rewards/rejected": -3.1746912002563477,
"step": 3600
},
{
"epoch": 2.4447421299397187,
"grad_norm": 29.816986083984375,
"learning_rate": 3.661150512214342e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.59291076660156,
"logps/rejected": -285.0599365234375,
"loss": 0.3406,
"rewards/accuracies": 0.7225000262260437,
"rewards/chosen": -0.8795223236083984,
"rewards/margins": 2.274043083190918,
"rewards/rejected": -3.1535654067993164,
"step": 3650
},
{
"epoch": 2.478231748158071,
"grad_norm": 58.20719528198242,
"learning_rate": 3.582348305752561e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -193.24021911621094,
"logps/rejected": -280.94549560546875,
"loss": 0.351,
"rewards/accuracies": 0.7300000190734863,
"rewards/chosen": -0.9534088373184204,
"rewards/margins": 2.271278142929077,
"rewards/rejected": -3.224686861038208,
"step": 3700
},
{
"epoch": 2.511721366376423,
"grad_norm": 61.28821563720703,
"learning_rate": 3.50354609929078e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -182.4898681640625,
"logps/rejected": -285.96759033203125,
"loss": 0.3099,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.9077944755554199,
"rewards/margins": 2.507885217666626,
"rewards/rejected": -3.4156792163848877,
"step": 3750
},
{
"epoch": 2.5452109845947755,
"grad_norm": 31.814802169799805,
"learning_rate": 3.424743892828999e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.01673889160156,
"logps/rejected": -291.91192626953125,
"loss": 0.3273,
"rewards/accuracies": 0.7487499713897705,
"rewards/chosen": -1.0059828758239746,
"rewards/margins": 2.3609018325805664,
"rewards/rejected": -3.366884469985962,
"step": 3800
},
{
"epoch": 2.578700602813128,
"grad_norm": 27.818708419799805,
"learning_rate": 3.345941686367218e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -193.6220703125,
"logps/rejected": -282.2691345214844,
"loss": 0.3249,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8329002261161804,
"rewards/margins": 2.3765745162963867,
"rewards/rejected": -3.209474802017212,
"step": 3850
},
{
"epoch": 2.6121902210314802,
"grad_norm": 43.8161735534668,
"learning_rate": 3.2671394799054374e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.08872985839844,
"logps/rejected": -288.9317932128906,
"loss": 0.3261,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -0.8284339308738708,
"rewards/margins": 2.3834288120269775,
"rewards/rejected": -3.211862564086914,
"step": 3900
},
{
"epoch": 2.6456798392498326,
"grad_norm": 40.35237503051758,
"learning_rate": 3.188337273443656e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.78111267089844,
"logps/rejected": -292.10601806640625,
"loss": 0.3219,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -0.8962965607643127,
"rewards/margins": 2.3189728260040283,
"rewards/rejected": -3.2152698040008545,
"step": 3950
},
{
"epoch": 2.679169457468185,
"grad_norm": 51.2871208190918,
"learning_rate": 3.1095350669818753e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -197.29296875,
"logps/rejected": -297.2159729003906,
"loss": 0.3349,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -0.9539132118225098,
"rewards/margins": 2.459329605102539,
"rewards/rejected": -3.4132425785064697,
"step": 4000
},
{
"epoch": 2.7126590756865374,
"grad_norm": 26.952434539794922,
"learning_rate": 3.0307328605200945e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -197.95140075683594,
"logps/rejected": -294.74676513671875,
"loss": 0.3164,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -0.8712408542633057,
"rewards/margins": 2.3473589420318604,
"rewards/rejected": -3.218599796295166,
"step": 4050
},
{
"epoch": 2.7461486939048894,
"grad_norm": 23.623044967651367,
"learning_rate": 2.9519306540583136e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.4278106689453,
"logps/rejected": -270.7493896484375,
"loss": 0.3482,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -0.8394105434417725,
"rewards/margins": 2.1500954627990723,
"rewards/rejected": -2.989506244659424,
"step": 4100
},
{
"epoch": 2.7796383121232418,
"grad_norm": 33.7256965637207,
"learning_rate": 2.8731284475965323e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.12367248535156,
"logps/rejected": -286.8393859863281,
"loss": 0.3118,
"rewards/accuracies": 0.7450000047683716,
"rewards/chosen": -0.8797828555107117,
"rewards/margins": 2.488528251647949,
"rewards/rejected": -3.3683111667633057,
"step": 4150
},
{
"epoch": 2.813127930341594,
"grad_norm": 26.706951141357422,
"learning_rate": 2.7943262411347515e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.7408447265625,
"logps/rejected": -285.0479736328125,
"loss": 0.3168,
"rewards/accuracies": 0.7350000143051147,
"rewards/chosen": -0.8466379642486572,
"rewards/margins": 2.4300098419189453,
"rewards/rejected": -3.2766480445861816,
"step": 4200
},
{
"epoch": 2.8466175485599465,
"grad_norm": 38.752418518066406,
"learning_rate": 2.7155240346729707e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -198.39134216308594,
"logps/rejected": -290.9305725097656,
"loss": 0.3135,
"rewards/accuracies": 0.7450000047683716,
"rewards/chosen": -0.9479385614395142,
"rewards/margins": 2.4617414474487305,
"rewards/rejected": -3.409679889678955,
"step": 4250
},
{
"epoch": 2.8801071667782985,
"grad_norm": 22.20810317993164,
"learning_rate": 2.63672182821119e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -202.04925537109375,
"logps/rejected": -298.950927734375,
"loss": 0.2977,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -0.897186815738678,
"rewards/margins": 2.3512582778930664,
"rewards/rejected": -3.2484447956085205,
"step": 4300
},
{
"epoch": 2.913596784996651,
"grad_norm": 66.3785171508789,
"learning_rate": 2.557919621749409e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -199.11300659179688,
"logps/rejected": -298.3018798828125,
"loss": 0.3053,
"rewards/accuracies": 0.7587500214576721,
"rewards/chosen": -0.9929912090301514,
"rewards/margins": 2.481295585632324,
"rewards/rejected": -3.4742870330810547,
"step": 4350
},
{
"epoch": 2.9470864032150033,
"grad_norm": 37.80686569213867,
"learning_rate": 2.4791174152876277e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -202.31398010253906,
"logps/rejected": -276.3575439453125,
"loss": 0.3206,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.9125861525535583,
"rewards/margins": 2.299701690673828,
"rewards/rejected": -3.2122879028320312,
"step": 4400
},
{
"epoch": 2.9805760214333556,
"grad_norm": 125.0142593383789,
"learning_rate": 2.4003152088258474e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.2313232421875,
"logps/rejected": -284.8965759277344,
"loss": 0.3186,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.8962329030036926,
"rewards/margins": 2.4796411991119385,
"rewards/rejected": -3.3758738040924072,
"step": 4450
},
{
"epoch": 3.014065639651708,
"grad_norm": 36.955318450927734,
"learning_rate": 2.321513002364066e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.8556671142578,
"logps/rejected": -289.2830810546875,
"loss": 0.3003,
"rewards/accuracies": 0.762499988079071,
"rewards/chosen": -0.953887403011322,
"rewards/margins": 2.597318410873413,
"rewards/rejected": -3.551206111907959,
"step": 4500
},
{
"epoch": 3.0475552578700604,
"grad_norm": 35.30141830444336,
"learning_rate": 2.242710795902285e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -201.59471130371094,
"logps/rejected": -296.01190185546875,
"loss": 0.2955,
"rewards/accuracies": 0.7574999928474426,
"rewards/chosen": -0.8759480118751526,
"rewards/margins": 2.536679744720459,
"rewards/rejected": -3.4126272201538086,
"step": 4550
},
{
"epoch": 3.081044876088413,
"grad_norm": 19.832714080810547,
"learning_rate": 2.163908589440504e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -187.59324645996094,
"logps/rejected": -301.8227233886719,
"loss": 0.2666,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9135188460350037,
"rewards/margins": 2.812459707260132,
"rewards/rejected": -3.7259786128997803,
"step": 4600
},
{
"epoch": 3.1145344943067648,
"grad_norm": 20.465190887451172,
"learning_rate": 2.0851063829787233e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.68150329589844,
"logps/rejected": -292.4168701171875,
"loss": 0.2952,
"rewards/accuracies": 0.7337499856948853,
"rewards/chosen": -0.7880871295928955,
"rewards/margins": 2.723849296569824,
"rewards/rejected": -3.5119359493255615,
"step": 4650
},
{
"epoch": 3.148024112525117,
"grad_norm": 16.645957946777344,
"learning_rate": 2.0063041765169423e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -181.14376831054688,
"logps/rejected": -276.7168884277344,
"loss": 0.2973,
"rewards/accuracies": 0.7350000143051147,
"rewards/chosen": -0.8847752213478088,
"rewards/margins": 2.6361799240112305,
"rewards/rejected": -3.5209546089172363,
"step": 4700
},
{
"epoch": 3.1815137307434695,
"grad_norm": 16.746036529541016,
"learning_rate": 1.9275019700551615e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.2080078125,
"logps/rejected": -303.0468444824219,
"loss": 0.2812,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -0.9689957499504089,
"rewards/margins": 2.876345157623291,
"rewards/rejected": -3.845341205596924,
"step": 4750
},
{
"epoch": 3.215003348961822,
"grad_norm": 22.3815975189209,
"learning_rate": 1.8486997635933806e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -203.40260314941406,
"logps/rejected": -296.6044616699219,
"loss": 0.308,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.9666973352432251,
"rewards/margins": 2.647951364517212,
"rewards/rejected": -3.6146488189697266,
"step": 4800
},
{
"epoch": 3.2484929671801743,
"grad_norm": 22.031641006469727,
"learning_rate": 1.7698975571315996e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -184.9111328125,
"logps/rejected": -287.4691162109375,
"loss": 0.2826,
"rewards/accuracies": 0.7450000047683716,
"rewards/chosen": -0.9380254149436951,
"rewards/margins": 2.7892355918884277,
"rewards/rejected": -3.7272610664367676,
"step": 4850
},
{
"epoch": 3.2819825853985263,
"grad_norm": 31.574790954589844,
"learning_rate": 1.6910953506698187e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -191.68785095214844,
"logps/rejected": -300.5950622558594,
"loss": 0.273,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9448862671852112,
"rewards/margins": 2.9543023109436035,
"rewards/rejected": -3.899188756942749,
"step": 4900
},
{
"epoch": 3.3154722036168787,
"grad_norm": 20.961929321289062,
"learning_rate": 1.612293144208038e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -189.8438720703125,
"logps/rejected": -291.8954162597656,
"loss": 0.2805,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -0.9848695397377014,
"rewards/margins": 2.715031385421753,
"rewards/rejected": -3.6999011039733887,
"step": 4950
},
{
"epoch": 3.348961821835231,
"grad_norm": 81.02938079833984,
"learning_rate": 1.5334909377462568e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -194.07749938964844,
"logps/rejected": -288.35015869140625,
"loss": 0.2876,
"rewards/accuracies": 0.7537500262260437,
"rewards/chosen": -1.0682913064956665,
"rewards/margins": 2.6784591674804688,
"rewards/rejected": -3.746750593185425,
"step": 5000
},
{
"epoch": 3.3824514400535834,
"grad_norm": 49.065887451171875,
"learning_rate": 1.454688731284476e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -203.08660888671875,
"logps/rejected": -315.59271240234375,
"loss": 0.2685,
"rewards/accuracies": 0.7699999809265137,
"rewards/chosen": -0.986056923866272,
"rewards/margins": 3.007080078125,
"rewards/rejected": -3.9931368827819824,
"step": 5050
},
{
"epoch": 3.415941058271936,
"grad_norm": 21.368894577026367,
"learning_rate": 1.375886524822695e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -198.26324462890625,
"logps/rejected": -298.6892395019531,
"loss": 0.2805,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -1.0937081575393677,
"rewards/margins": 2.7569918632507324,
"rewards/rejected": -3.8506996631622314,
"step": 5100
},
{
"epoch": 3.4494306764902882,
"grad_norm": 18.683109283447266,
"learning_rate": 1.2970843183609141e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -205.4937286376953,
"logps/rejected": -288.3065185546875,
"loss": 0.2852,
"rewards/accuracies": 0.7662500143051147,
"rewards/chosen": -1.0320967435836792,
"rewards/margins": 2.5120697021484375,
"rewards/rejected": -3.544166326522827,
"step": 5150
},
{
"epoch": 3.48292029470864,
"grad_norm": 38.7464599609375,
"learning_rate": 1.218282111899133e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -194.1973876953125,
"logps/rejected": -299.29559326171875,
"loss": 0.2871,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.104549527168274,
"rewards/margins": 2.853426218032837,
"rewards/rejected": -3.9579761028289795,
"step": 5200
},
{
"epoch": 3.5164099129269926,
"grad_norm": 7.350229740142822,
"learning_rate": 1.1394799054373522e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -201.8234405517578,
"logps/rejected": -290.8706970214844,
"loss": 0.3006,
"rewards/accuracies": 0.7387499809265137,
"rewards/chosen": -1.0995410680770874,
"rewards/margins": 2.636826992034912,
"rewards/rejected": -3.736368417739868,
"step": 5250
},
{
"epoch": 3.549899531145345,
"grad_norm": 19.686573028564453,
"learning_rate": 1.0606776989755713e-07,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -190.27627563476562,
"logps/rejected": -288.27923583984375,
"loss": 0.2949,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -1.0866522789001465,
"rewards/margins": 2.7148284912109375,
"rewards/rejected": -3.801480770111084,
"step": 5300
},
{
"epoch": 3.5833891493636973,
"grad_norm": 23.88344955444336,
"learning_rate": 9.818754925137903e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -196.48269653320312,
"logps/rejected": -295.1553039550781,
"loss": 0.2976,
"rewards/accuracies": 0.7262499928474426,
"rewards/chosen": -1.0540306568145752,
"rewards/margins": 2.6128809452056885,
"rewards/rejected": -3.6669113636016846,
"step": 5350
},
{
"epoch": 3.6168787675820493,
"grad_norm": 28.22519874572754,
"learning_rate": 9.030732860520094e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -196.87808227539062,
"logps/rejected": -294.7259216308594,
"loss": 0.2903,
"rewards/accuracies": 0.7487499713897705,
"rewards/chosen": -1.0297123193740845,
"rewards/margins": 2.7472548484802246,
"rewards/rejected": -3.7769670486450195,
"step": 5400
},
{
"epoch": 3.6503683858004017,
"grad_norm": 34.18745422363281,
"learning_rate": 8.242710795902284e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.99404907226562,
"logps/rejected": -290.40350341796875,
"loss": 0.2871,
"rewards/accuracies": 0.7524999976158142,
"rewards/chosen": -1.050000786781311,
"rewards/margins": 2.8044447898864746,
"rewards/rejected": -3.8544461727142334,
"step": 5450
},
{
"epoch": 3.683858004018754,
"grad_norm": 47.866973876953125,
"learning_rate": 7.454688731284475e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -200.01785278320312,
"logps/rejected": -314.4879150390625,
"loss": 0.2842,
"rewards/accuracies": 0.7637500166893005,
"rewards/chosen": -1.1343244314193726,
"rewards/margins": 2.814058780670166,
"rewards/rejected": -3.9483835697174072,
"step": 5500
},
{
"epoch": 3.7173476222371065,
"grad_norm": 42.807334899902344,
"learning_rate": 6.666666666666665e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.06690979003906,
"logps/rejected": -304.86456298828125,
"loss": 0.2987,
"rewards/accuracies": 0.7262499928474426,
"rewards/chosen": -0.996014416217804,
"rewards/margins": 2.8309714794158936,
"rewards/rejected": -3.826986074447632,
"step": 5550
},
{
"epoch": 3.750837240455459,
"grad_norm": 43.85563278198242,
"learning_rate": 5.8786446020488567e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.9456787109375,
"logps/rejected": -302.6964111328125,
"loss": 0.2717,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -0.9534288644790649,
"rewards/margins": 2.9733974933624268,
"rewards/rejected": -3.926826238632202,
"step": 5600
},
{
"epoch": 3.7843268586738112,
"grad_norm": 15.730545043945312,
"learning_rate": 5.090622537431047e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -201.83468627929688,
"logps/rejected": -303.9621887207031,
"loss": 0.2778,
"rewards/accuracies": 0.7425000071525574,
"rewards/chosen": -1.114385962486267,
"rewards/margins": 2.769357919692993,
"rewards/rejected": -3.88374400138855,
"step": 5650
},
{
"epoch": 3.8178164768921636,
"grad_norm": 8.295851707458496,
"learning_rate": 4.3026004728132384e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -188.35511779785156,
"logps/rejected": -294.769287109375,
"loss": 0.2819,
"rewards/accuracies": 0.7637500166893005,
"rewards/chosen": -1.0945212841033936,
"rewards/margins": 2.837056875228882,
"rewards/rejected": -3.9315783977508545,
"step": 5700
},
{
"epoch": 3.8513060951105156,
"grad_norm": 26.289419174194336,
"learning_rate": 3.5145784081954295e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -189.79083251953125,
"logps/rejected": -306.619140625,
"loss": 0.275,
"rewards/accuracies": 0.7674999833106995,
"rewards/chosen": -1.047197699546814,
"rewards/margins": 2.954050064086914,
"rewards/rejected": -4.001247406005859,
"step": 5750
},
{
"epoch": 3.884795713328868,
"grad_norm": 39.87799835205078,
"learning_rate": 2.72655634357762e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -195.89584350585938,
"logps/rejected": -300.447998046875,
"loss": 0.3016,
"rewards/accuracies": 0.7362499833106995,
"rewards/chosen": -1.0226908922195435,
"rewards/margins": 2.853440284729004,
"rewards/rejected": -3.876131057739258,
"step": 5800
},
{
"epoch": 3.9182853315472204,
"grad_norm": 34.19044494628906,
"learning_rate": 1.938534278959811e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -198.2811737060547,
"logps/rejected": -297.48712158203125,
"loss": 0.2798,
"rewards/accuracies": 0.7612500190734863,
"rewards/chosen": -0.9482996463775635,
"rewards/margins": 2.7399239540100098,
"rewards/rejected": -3.6882238388061523,
"step": 5850
},
{
"epoch": 3.9517749497655728,
"grad_norm": 50.39924621582031,
"learning_rate": 1.1505122143420016e-08,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -186.64804077148438,
"logps/rejected": -296.8222961425781,
"loss": 0.2861,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1375445127487183,
"rewards/margins": 2.8117544651031494,
"rewards/rejected": -3.9492990970611572,
"step": 5900
},
{
"epoch": 3.985264567983925,
"grad_norm": 24.80521583557129,
"learning_rate": 3.6249014972419224e-09,
"logits/chosen": NaN,
"logits/rejected": NaN,
"logps/chosen": -192.86546325683594,
"logps/rejected": -305.5084228515625,
"loss": 0.2767,
"rewards/accuracies": 0.7475000023841858,
"rewards/chosen": -1.0181684494018555,
"rewards/margins": 2.8778107166290283,
"rewards/rejected": -3.895979642868042,
"step": 5950
}
],
"logging_steps": 50,
"max_steps": 5972,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}