{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 5972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.033489618218352314, "grad_norm": 39.56058883666992, "learning_rate": 4.375e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.2135009765625, "logps/rejected": -218.9482421875, "loss": 0.6927, "rewards/accuracies": 0.41499999165534973, "rewards/chosen": 0.0009201900684274733, "rewards/margins": 0.0012563117779791355, "rewards/rejected": -0.00033612194238230586, "step": 50 }, { "epoch": 0.06697923643670463, "grad_norm": 55.64208984375, "learning_rate": 8.839285714285714e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.50753784179688, "logps/rejected": -224.6807403564453, "loss": 0.6932, "rewards/accuracies": 0.4087499976158142, "rewards/chosen": -0.0016289422055706382, "rewards/margins": 0.0002692897687666118, "rewards/rejected": -0.001898231916129589, "step": 100 }, { "epoch": 0.10046885465505694, "grad_norm": 48.16231918334961, "learning_rate": 1.3303571428571427e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -164.7879638671875, "logps/rejected": -219.03224182128906, "loss": 0.6925, "rewards/accuracies": 0.4325000047683716, "rewards/chosen": -0.0006416282267309725, "rewards/margins": 0.0016548261046409607, "rewards/rejected": -0.0022964540403336287, "step": 150 }, { "epoch": 0.13395847287340926, "grad_norm": 45.87161636352539, "learning_rate": 1.776785714285714e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.67550659179688, "logps/rejected": -221.42312622070312, "loss": 0.6933, "rewards/accuracies": 0.4050000011920929, "rewards/chosen": -0.003886653808876872, "rewards/margins": 8.521832205587998e-05, "rewards/rejected": -0.003971872851252556, "step": 200 }, { "epoch": 0.16744809109176156, "grad_norm": 34.68981170654297, "learning_rate": 2.2232142857142856e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.92369079589844, "logps/rejected": -222.72586059570312, "loss": 0.6904, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.004237725865095854, "rewards/margins": 0.005978057160973549, "rewards/rejected": -0.010215784423053265, "step": 250 }, { "epoch": 0.20093770931011387, "grad_norm": 45.68313217163086, "learning_rate": 2.669642857142857e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.19508361816406, "logps/rejected": -220.88339233398438, "loss": 0.6866, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.010833066888153553, "rewards/margins": 0.014277225360274315, "rewards/rejected": -0.025110295042395592, "step": 300 }, { "epoch": 0.23442732752846618, "grad_norm": 45.355247497558594, "learning_rate": 3.1160714285714285e-07, "logits/chosen": NaN, "logits/rejected": -1.2874314785003662, "logps/chosen": -172.1802520751953, "logps/rejected": -222.34373474121094, "loss": 0.6817, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.022201180458068848, "rewards/margins": 0.024774856865406036, "rewards/rejected": -0.04697604104876518, "step": 350 }, { "epoch": 0.2679169457468185, "grad_norm": 44.401466369628906, "learning_rate": 3.5625e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -171.78598022460938, "logps/rejected": -225.00076293945312, "loss": 0.6694, "rewards/accuracies": 0.5525000095367432, "rewards/chosen": -0.04056182876229286, "rewards/margins": 0.05375281721353531, "rewards/rejected": -0.09431464225053787, "step": 400 }, { "epoch": 0.3014065639651708, "grad_norm": 42.0828742980957, "learning_rate": 4.008928571428571e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.93777465820312, "logps/rejected": -225.8251953125, "loss": 0.6615, "rewards/accuracies": 0.5262500047683716, "rewards/chosen": -0.08230926841497421, "rewards/margins": 0.07786127924919128, "rewards/rejected": -0.1601705402135849, "step": 450 }, { "epoch": 0.33489618218352313, "grad_norm": 37.493553161621094, "learning_rate": 4.455357142857143e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.60545349121094, "logps/rejected": -223.0897674560547, "loss": 0.6547, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": -0.12488727271556854, "rewards/margins": 0.10414745658636093, "rewards/rejected": -0.22903470695018768, "step": 500 }, { "epoch": 0.3683858004018754, "grad_norm": 42.67152404785156, "learning_rate": 4.901785714285714e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -173.47787475585938, "logps/rejected": -232.50518798828125, "loss": 0.631, "rewards/accuracies": 0.53125, "rewards/chosen": -0.15443742275238037, "rewards/margins": 0.18413911759853363, "rewards/rejected": -0.3385765552520752, "step": 550 }, { "epoch": 0.40187541862022774, "grad_norm": 32.57563018798828, "learning_rate": 5.348214285714285e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -181.27125549316406, "logps/rejected": -241.04226684570312, "loss": 0.6242, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2177175134420395, "rewards/margins": 0.22946205735206604, "rewards/rejected": -0.4471796154975891, "step": 600 }, { "epoch": 0.43536503683858, "grad_norm": 47.70533752441406, "learning_rate": 5.794642857142857e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.95242309570312, "logps/rejected": -240.94540405273438, "loss": 0.6072, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": -0.31218427419662476, "rewards/margins": 0.3060773015022278, "rewards/rejected": -0.6182616353034973, "step": 650 }, { "epoch": 0.46885465505693236, "grad_norm": 55.132423400878906, "learning_rate": 6.241071428571429e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.1224365234375, "logps/rejected": -239.97381591796875, "loss": 0.598, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": -0.38468390703201294, "rewards/margins": 0.3886369466781616, "rewards/rejected": -0.7733209133148193, "step": 700 }, { "epoch": 0.5023442732752846, "grad_norm": 35.69628143310547, "learning_rate": 6.6875e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -181.9203643798828, "logps/rejected": -238.3597412109375, "loss": 0.6054, "rewards/accuracies": 0.5350000262260437, "rewards/chosen": -0.4038671851158142, "rewards/margins": 0.38458195328712463, "rewards/rejected": -0.7884491086006165, "step": 750 }, { "epoch": 0.535833891493637, "grad_norm": 52.34265899658203, "learning_rate": 7.133928571428571e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -173.30198669433594, "logps/rejected": -238.93728637695312, "loss": 0.5574, "rewards/accuracies": 0.5899999737739563, "rewards/chosen": -0.4324275553226471, "rewards/margins": 0.5758498907089233, "rewards/rejected": -1.008277416229248, "step": 800 }, { "epoch": 0.5693235097119893, "grad_norm": 50.31780242919922, "learning_rate": 7.580357142857143e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.4162139892578, "logps/rejected": -239.99441528320312, "loss": 0.5693, "rewards/accuracies": 0.5762500166893005, "rewards/chosen": -0.4396001696586609, "rewards/margins": 0.5518670678138733, "rewards/rejected": -0.991467297077179, "step": 850 }, { "epoch": 0.6028131279303416, "grad_norm": 35.81449508666992, "learning_rate": 7.995271867612292e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.32102966308594, "logps/rejected": -241.9414520263672, "loss": 0.5773, "rewards/accuracies": 0.5774999856948853, "rewards/chosen": -0.5533062219619751, "rewards/margins": 0.5976377129554749, "rewards/rejected": -1.1509439945220947, "step": 900 }, { "epoch": 0.6363027461486939, "grad_norm": 37.500022888183594, "learning_rate": 7.916469661150512e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.38853454589844, "logps/rejected": -238.58958435058594, "loss": 0.5608, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5429355502128601, "rewards/margins": 0.6483522057533264, "rewards/rejected": -1.1912877559661865, "step": 950 }, { "epoch": 0.6697923643670463, "grad_norm": 45.89781188964844, "learning_rate": 7.837667454688732e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.39967346191406, "logps/rejected": -242.6263427734375, "loss": 0.5456, "rewards/accuracies": 0.5824999809265137, "rewards/chosen": -0.546024739742279, "rewards/margins": 0.7407156229019165, "rewards/rejected": -1.2867404222488403, "step": 1000 }, { "epoch": 0.7032819825853985, "grad_norm": 30.887229919433594, "learning_rate": 7.75886524822695e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -180.49740600585938, "logps/rejected": -256.4322509765625, "loss": 0.5166, "rewards/accuracies": 0.6274999976158142, "rewards/chosen": -0.6010158658027649, "rewards/margins": 0.8881167769432068, "rewards/rejected": -1.4891326427459717, "step": 1050 }, { "epoch": 0.7367716008037508, "grad_norm": 54.525856018066406, "learning_rate": 7.680063041765169e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -169.29653930664062, "logps/rejected": -241.8204345703125, "loss": 0.5213, "rewards/accuracies": 0.5849999785423279, "rewards/chosen": -0.6621356010437012, "rewards/margins": 0.8917463421821594, "rewards/rejected": -1.553882122039795, "step": 1100 }, { "epoch": 0.7702612190221031, "grad_norm": 37.80088424682617, "learning_rate": 7.601260835303388e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.52398681640625, "logps/rejected": -246.53189086914062, "loss": 0.5385, "rewards/accuracies": 0.5950000286102295, "rewards/chosen": -0.6485376954078674, "rewards/margins": 0.8500573039054871, "rewards/rejected": -1.4985949993133545, "step": 1150 }, { "epoch": 0.8037508372404555, "grad_norm": 38.089324951171875, "learning_rate": 7.522458628841607e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.8751220703125, "logps/rejected": -272.0120849609375, "loss": 0.5175, "rewards/accuracies": 0.6162499785423279, "rewards/chosen": -0.6311337351799011, "rewards/margins": 0.9966024160385132, "rewards/rejected": -1.627736210823059, "step": 1200 }, { "epoch": 0.8372404554588078, "grad_norm": 39.7374153137207, "learning_rate": 7.443656422379827e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.6376190185547, "logps/rejected": -257.7156982421875, "loss": 0.5304, "rewards/accuracies": 0.5975000262260437, "rewards/chosen": -0.7539389133453369, "rewards/margins": 1.0195887088775635, "rewards/rejected": -1.7735275030136108, "step": 1250 }, { "epoch": 0.87073007367716, "grad_norm": 30.075101852416992, "learning_rate": 7.364854215918045e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.0233154296875, "logps/rejected": -246.95262145996094, "loss": 0.5185, "rewards/accuracies": 0.6162499785423279, "rewards/chosen": -0.6855795383453369, "rewards/margins": 0.9803519248962402, "rewards/rejected": -1.6659313440322876, "step": 1300 }, { "epoch": 0.9042196918955124, "grad_norm": 39.85395431518555, "learning_rate": 7.286052009456264e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.7262725830078, "logps/rejected": -253.09869384765625, "loss": 0.5237, "rewards/accuracies": 0.6012499928474426, "rewards/chosen": -0.606716513633728, "rewards/margins": 0.8856968879699707, "rewards/rejected": -1.4924132823944092, "step": 1350 }, { "epoch": 0.9377093101138647, "grad_norm": 66.29072570800781, "learning_rate": 7.207249802994484e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.25836181640625, "logps/rejected": -241.17901611328125, "loss": 0.5323, "rewards/accuracies": 0.6012499928474426, "rewards/chosen": -0.6343129873275757, "rewards/margins": 0.9016135931015015, "rewards/rejected": -1.5359266996383667, "step": 1400 }, { "epoch": 0.971198928332217, "grad_norm": 36.76164245605469, "learning_rate": 7.128447596532703e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.4203643798828, "logps/rejected": -249.50076293945312, "loss": 0.5378, "rewards/accuracies": 0.5849999785423279, "rewards/chosen": -0.5479399561882019, "rewards/margins": 0.8743146657943726, "rewards/rejected": -1.4222546815872192, "step": 1450 }, { "epoch": 1.0046885465505693, "grad_norm": 53.488407135009766, "learning_rate": 7.049645390070921e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.12411499023438, "logps/rejected": -253.10464477539062, "loss": 0.4944, "rewards/accuracies": 0.6324999928474426, "rewards/chosen": -0.5486608147621155, "rewards/margins": 1.0833656787872314, "rewards/rejected": -1.6320266723632812, "step": 1500 }, { "epoch": 1.0381781647689217, "grad_norm": 49.80898666381836, "learning_rate": 6.97084318360914e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.8605499267578, "logps/rejected": -265.5860595703125, "loss": 0.4357, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6526676416397095, "rewards/margins": 1.3229660987854004, "rewards/rejected": -1.9756335020065308, "step": 1550 }, { "epoch": 1.0716677829872738, "grad_norm": 39.074790954589844, "learning_rate": 6.89204097714736e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -185.23265075683594, "logps/rejected": -257.21270751953125, "loss": 0.455, "rewards/accuracies": 0.6762499809265137, "rewards/chosen": -0.6439327001571655, "rewards/margins": 1.2369264364242554, "rewards/rejected": -1.8808592557907104, "step": 1600 }, { "epoch": 1.1051574012056262, "grad_norm": 24.9747257232666, "learning_rate": 6.813238770685579e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -193.9910125732422, "logps/rejected": -257.0543518066406, "loss": 0.4256, "rewards/accuracies": 0.6912500262260437, "rewards/chosen": -0.5875076055526733, "rewards/margins": 1.313684344291687, "rewards/rejected": -1.9011921882629395, "step": 1650 }, { "epoch": 1.1386470194239786, "grad_norm": 27.623506546020508, "learning_rate": 6.734436564223798e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.95932006835938, "logps/rejected": -246.83566284179688, "loss": 0.451, "rewards/accuracies": 0.6650000214576721, "rewards/chosen": -0.7364577651023865, "rewards/margins": 1.2061336040496826, "rewards/rejected": -1.9425911903381348, "step": 1700 }, { "epoch": 1.1721366376423308, "grad_norm": 39.15848922729492, "learning_rate": 6.655634357762017e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.35861206054688, "logps/rejected": -249.7042236328125, "loss": 0.4414, "rewards/accuracies": 0.6700000166893005, "rewards/chosen": -0.6322548985481262, "rewards/margins": 1.4189176559448242, "rewards/rejected": -2.0511724948883057, "step": 1750 }, { "epoch": 1.2056262558606832, "grad_norm": 46.19662857055664, "learning_rate": 6.576832151300236e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.02719116210938, "logps/rejected": -266.9911193847656, "loss": 0.4244, "rewards/accuracies": 0.7012500166893005, "rewards/chosen": -0.6425164937973022, "rewards/margins": 1.4480139017105103, "rewards/rejected": -2.0905306339263916, "step": 1800 }, { "epoch": 1.2391158740790356, "grad_norm": 25.113298416137695, "learning_rate": 6.498029944838455e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.42767333984375, "logps/rejected": -260.0691223144531, "loss": 0.4309, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6952015161514282, "rewards/margins": 1.4248483180999756, "rewards/rejected": -2.1200499534606934, "step": 1850 }, { "epoch": 1.2726054922973877, "grad_norm": 44.80360412597656, "learning_rate": 6.419227738376675e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.84951782226562, "logps/rejected": -257.0547790527344, "loss": 0.4364, "rewards/accuracies": 0.6762499809265137, "rewards/chosen": -0.8910938501358032, "rewards/margins": 1.557470679283142, "rewards/rejected": -2.4485647678375244, "step": 1900 }, { "epoch": 1.3060951105157401, "grad_norm": 28.531023025512695, "learning_rate": 6.340425531914892e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -185.15121459960938, "logps/rejected": -261.5601501464844, "loss": 0.4392, "rewards/accuracies": 0.6725000143051147, "rewards/chosen": -0.7338109612464905, "rewards/margins": 1.4515758752822876, "rewards/rejected": -2.185386896133423, "step": 1950 }, { "epoch": 1.3395847287340925, "grad_norm": 28.397706985473633, "learning_rate": 6.261623325453112e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.82705688476562, "logps/rejected": -271.1513366699219, "loss": 0.4268, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.7382247447967529, "rewards/margins": 1.5511670112609863, "rewards/rejected": -2.2893919944763184, "step": 2000 }, { "epoch": 1.3730743469524447, "grad_norm": 35.965423583984375, "learning_rate": 6.182821118991332e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.85757446289062, "logps/rejected": -273.0481262207031, "loss": 0.4352, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -0.8608375787734985, "rewards/margins": 1.4874813556671143, "rewards/rejected": -2.3483190536499023, "step": 2050 }, { "epoch": 1.406563965170797, "grad_norm": 38.594947814941406, "learning_rate": 6.10401891252955e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.38641357421875, "logps/rejected": -278.737548828125, "loss": 0.4166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7014285922050476, "rewards/margins": 1.5582598447799683, "rewards/rejected": -2.259688377380371, "step": 2100 }, { "epoch": 1.4400535833891492, "grad_norm": 35.8038215637207, "learning_rate": 6.025216706067769e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.76324462890625, "logps/rejected": -275.5191650390625, "loss": 0.404, "rewards/accuracies": 0.7087500095367432, "rewards/chosen": -0.6745861172676086, "rewards/margins": 1.6506869792938232, "rewards/rejected": -2.325273036956787, "step": 2150 }, { "epoch": 1.4735432016075016, "grad_norm": 29.62619972229004, "learning_rate": 5.946414499605989e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.25250244140625, "logps/rejected": -272.7938232421875, "loss": 0.398, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6346092224121094, "rewards/margins": 1.7176785469055176, "rewards/rejected": -2.352287769317627, "step": 2200 }, { "epoch": 1.507032819825854, "grad_norm": 21.93035316467285, "learning_rate": 5.867612293144208e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.32916259765625, "logps/rejected": -260.4537048339844, "loss": 0.4424, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6742202639579773, "rewards/margins": 1.4148352146148682, "rewards/rejected": -2.0890555381774902, "step": 2250 }, { "epoch": 1.5405224380442064, "grad_norm": 15.887839317321777, "learning_rate": 5.788810086682427e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.4200439453125, "logps/rejected": -274.1068420410156, "loss": 0.4002, "rewards/accuracies": 0.6837499737739563, "rewards/chosen": -0.6341544985771179, "rewards/margins": 1.6712123155593872, "rewards/rejected": -2.3053667545318604, "step": 2300 }, { "epoch": 1.5740120562625586, "grad_norm": 46.13706588745117, "learning_rate": 5.710007880220646e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.80099487304688, "logps/rejected": -268.6807861328125, "loss": 0.4078, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": -0.7255478501319885, "rewards/margins": 1.6650432348251343, "rewards/rejected": -2.3905911445617676, "step": 2350 }, { "epoch": 1.607501674480911, "grad_norm": 41.01249313354492, "learning_rate": 5.631205673758865e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.23605346679688, "logps/rejected": -268.6483154296875, "loss": 0.4133, "rewards/accuracies": 0.6787499785423279, "rewards/chosen": -0.8096724152565002, "rewards/margins": 1.5884754657745361, "rewards/rejected": -2.3981478214263916, "step": 2400 }, { "epoch": 1.6409912926992631, "grad_norm": 44.18216323852539, "learning_rate": 5.552403467297084e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -206.16220092773438, "logps/rejected": -275.8337097167969, "loss": 0.42, "rewards/accuracies": 0.6887500286102295, "rewards/chosen": -0.776029109954834, "rewards/margins": 1.5833215713500977, "rewards/rejected": -2.3593506813049316, "step": 2450 }, { "epoch": 1.6744809109176155, "grad_norm": 38.77958679199219, "learning_rate": 5.473601260835303e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.59031677246094, "logps/rejected": -268.516357421875, "loss": 0.4145, "rewards/accuracies": 0.6850000023841858, "rewards/chosen": -0.6448932886123657, "rewards/margins": 1.565537691116333, "rewards/rejected": -2.210430860519409, "step": 2500 }, { "epoch": 1.707970529135968, "grad_norm": 36.07415008544922, "learning_rate": 5.394799054373523e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -183.8679656982422, "logps/rejected": -262.7795104980469, "loss": 0.3909, "rewards/accuracies": 0.7037500143051147, "rewards/chosen": -0.6056129336357117, "rewards/margins": 1.6474745273590088, "rewards/rejected": -2.2530875205993652, "step": 2550 }, { "epoch": 1.7414601473543203, "grad_norm": 59.49274826049805, "learning_rate": 5.315996847911741e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.5425567626953, "logps/rejected": -271.7103576660156, "loss": 0.3796, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7344449758529663, "rewards/margins": 1.7059468030929565, "rewards/rejected": -2.440391778945923, "step": 2600 }, { "epoch": 1.7749497655726725, "grad_norm": 41.25373077392578, "learning_rate": 5.23719464144996e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.00929260253906, "logps/rejected": -268.0653076171875, "loss": 0.3793, "rewards/accuracies": 0.7099999785423279, "rewards/chosen": -0.7967619895935059, "rewards/margins": 1.9842884540557861, "rewards/rejected": -2.781050443649292, "step": 2650 }, { "epoch": 1.8084393837910246, "grad_norm": 37.203304290771484, "learning_rate": 5.15839243498818e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.30181884765625, "logps/rejected": -272.9234313964844, "loss": 0.4057, "rewards/accuracies": 0.6825000047683716, "rewards/chosen": -0.7267603874206543, "rewards/margins": 1.7311309576034546, "rewards/rejected": -2.4578914642333984, "step": 2700 }, { "epoch": 1.841929002009377, "grad_norm": 30.321313858032227, "learning_rate": 5.079590228526398e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.3111114501953, "logps/rejected": -270.3074645996094, "loss": 0.4081, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.751646101474762, "rewards/margins": 1.640588641166687, "rewards/rejected": -2.3922348022460938, "step": 2750 }, { "epoch": 1.8754186202277294, "grad_norm": 47.42466735839844, "learning_rate": 5.000788022064617e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -179.08119201660156, "logps/rejected": -277.5658264160156, "loss": 0.3837, "rewards/accuracies": 0.6949999928474426, "rewards/chosen": -0.6461160182952881, "rewards/margins": 1.8302369117736816, "rewards/rejected": -2.476353168487549, "step": 2800 }, { "epoch": 1.9089082384460818, "grad_norm": 37.78199005126953, "learning_rate": 4.921985815602837e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.4869384765625, "logps/rejected": -276.9580383300781, "loss": 0.3995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7339184284210205, "rewards/margins": 1.7630858421325684, "rewards/rejected": -2.4970040321350098, "step": 2850 }, { "epoch": 1.942397856664434, "grad_norm": 42.2109375, "learning_rate": 4.843183609141055e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -199.12640380859375, "logps/rejected": -275.89630126953125, "loss": 0.4105, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6928785443305969, "rewards/margins": 1.7767040729522705, "rewards/rejected": -2.4695825576782227, "step": 2900 }, { "epoch": 1.9758874748827864, "grad_norm": 50.837059020996094, "learning_rate": 4.764381402679275e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.2418212890625, "logps/rejected": -285.7373046875, "loss": 0.3963, "rewards/accuracies": 0.7012500166893005, "rewards/chosen": -0.7950295805931091, "rewards/margins": 1.9002233743667603, "rewards/rejected": -2.6952526569366455, "step": 2950 }, { "epoch": 2.0093770931011385, "grad_norm": 24.105684280395508, "learning_rate": 4.685579196217494e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -175.44351196289062, "logps/rejected": -268.713134765625, "loss": 0.3725, "rewards/accuracies": 0.7087500095367432, "rewards/chosen": -0.7846677303314209, "rewards/margins": 1.9885753393173218, "rewards/rejected": -2.7732431888580322, "step": 3000 }, { "epoch": 2.042866711319491, "grad_norm": 21.2971248626709, "learning_rate": 4.606776989755713e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.33584594726562, "logps/rejected": -282.147705078125, "loss": 0.3284, "rewards/accuracies": 0.7512500286102295, "rewards/chosen": -0.6468074917793274, "rewards/margins": 2.03031849861145, "rewards/rejected": -2.677126169204712, "step": 3050 }, { "epoch": 2.0763563295378433, "grad_norm": 13.780171394348145, "learning_rate": 4.527974783293932e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.3354949951172, "logps/rejected": -282.95489501953125, "loss": 0.3183, "rewards/accuracies": 0.7549999952316284, "rewards/chosen": -0.7194473147392273, "rewards/margins": 2.1648471355438232, "rewards/rejected": -2.884294033050537, "step": 3100 }, { "epoch": 2.1098459477561957, "grad_norm": 27.040647506713867, "learning_rate": 4.449172576832151e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.97789001464844, "logps/rejected": -267.7035217285156, "loss": 0.3298, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.7726535797119141, "rewards/margins": 2.0831313133239746, "rewards/rejected": -2.8557848930358887, "step": 3150 }, { "epoch": 2.1433355659745477, "grad_norm": 52.36626052856445, "learning_rate": 4.37037037037037e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.05853271484375, "logps/rejected": -278.2367248535156, "loss": 0.3381, "rewards/accuracies": 0.7287499904632568, "rewards/chosen": -0.7423791289329529, "rewards/margins": 2.1862621307373047, "rewards/rejected": -2.9286410808563232, "step": 3200 }, { "epoch": 2.1768251841929, "grad_norm": 44.21054458618164, "learning_rate": 4.2915681639085896e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -194.27188110351562, "logps/rejected": -286.78887939453125, "loss": 0.3391, "rewards/accuracies": 0.7212499976158142, "rewards/chosen": -0.7143966555595398, "rewards/margins": 2.098987579345703, "rewards/rejected": -2.8133840560913086, "step": 3250 }, { "epoch": 2.2103148024112524, "grad_norm": 55.730262756347656, "learning_rate": 4.212765957446808e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.7841796875, "logps/rejected": -289.6191101074219, "loss": 0.3417, "rewards/accuracies": 0.7174999713897705, "rewards/chosen": -0.747239351272583, "rewards/margins": 2.2462081909179688, "rewards/rejected": -2.993447780609131, "step": 3300 }, { "epoch": 2.243804420629605, "grad_norm": 34.05455017089844, "learning_rate": 4.1339637509850275e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.21963500976562, "logps/rejected": -282.9302062988281, "loss": 0.3288, "rewards/accuracies": 0.7275000214576721, "rewards/chosen": -0.8579057455062866, "rewards/margins": 2.2049171924591064, "rewards/rejected": -3.0628225803375244, "step": 3350 }, { "epoch": 2.2772940388479572, "grad_norm": 32.748878479003906, "learning_rate": 4.0551615445232467e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -194.06427001953125, "logps/rejected": -288.862548828125, "loss": 0.3154, "rewards/accuracies": 0.75, "rewards/chosen": -0.9077091217041016, "rewards/margins": 2.30654239654541, "rewards/rejected": -3.2142515182495117, "step": 3400 }, { "epoch": 2.3107836570663096, "grad_norm": 26.387239456176758, "learning_rate": 3.976359338061466e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.67759704589844, "logps/rejected": -284.309326171875, "loss": 0.3273, "rewards/accuracies": 0.7287499904632568, "rewards/chosen": -0.9315968155860901, "rewards/margins": 2.26393985748291, "rewards/rejected": -3.1955366134643555, "step": 3450 }, { "epoch": 2.3442732752846616, "grad_norm": 26.215883255004883, "learning_rate": 3.8975571315996845e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -193.22149658203125, "logps/rejected": -286.2984313964844, "loss": 0.3245, "rewards/accuracies": 0.7200000286102295, "rewards/chosen": -0.839231014251709, "rewards/margins": 2.3949248790740967, "rewards/rejected": -3.2341556549072266, "step": 3500 }, { "epoch": 2.377762893503014, "grad_norm": 45.05733108520508, "learning_rate": 3.8187549251379037e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.80043029785156, "logps/rejected": -286.0914306640625, "loss": 0.3323, "rewards/accuracies": 0.7162500023841858, "rewards/chosen": -0.8909017443656921, "rewards/margins": 2.335562229156494, "rewards/rejected": -3.226464033126831, "step": 3550 }, { "epoch": 2.4112525117213663, "grad_norm": 23.256881713867188, "learning_rate": 3.739952718676123e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -197.46514892578125, "logps/rejected": -286.53509521484375, "loss": 0.3074, "rewards/accuracies": 0.7787500023841858, "rewards/chosen": -0.8636592030525208, "rewards/margins": 2.3110320568084717, "rewards/rejected": -3.1746912002563477, "step": 3600 }, { "epoch": 2.4447421299397187, "grad_norm": 29.816986083984375, "learning_rate": 3.661150512214342e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.59291076660156, "logps/rejected": -285.0599365234375, "loss": 0.3406, "rewards/accuracies": 0.7225000262260437, "rewards/chosen": -0.8795223236083984, "rewards/margins": 2.274043083190918, "rewards/rejected": -3.1535654067993164, "step": 3650 }, { "epoch": 2.478231748158071, "grad_norm": 58.20719528198242, "learning_rate": 3.582348305752561e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -193.24021911621094, "logps/rejected": -280.94549560546875, "loss": 0.351, "rewards/accuracies": 0.7300000190734863, "rewards/chosen": -0.9534088373184204, "rewards/margins": 2.271278142929077, "rewards/rejected": -3.224686861038208, "step": 3700 }, { "epoch": 2.511721366376423, "grad_norm": 61.28821563720703, "learning_rate": 3.50354609929078e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -182.4898681640625, "logps/rejected": -285.96759033203125, "loss": 0.3099, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9077944755554199, "rewards/margins": 2.507885217666626, "rewards/rejected": -3.4156792163848877, "step": 3750 }, { "epoch": 2.5452109845947755, "grad_norm": 31.814802169799805, "learning_rate": 3.424743892828999e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.01673889160156, "logps/rejected": -291.91192626953125, "loss": 0.3273, "rewards/accuracies": 0.7487499713897705, "rewards/chosen": -1.0059828758239746, "rewards/margins": 2.3609018325805664, "rewards/rejected": -3.366884469985962, "step": 3800 }, { "epoch": 2.578700602813128, "grad_norm": 27.818708419799805, "learning_rate": 3.345941686367218e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -193.6220703125, "logps/rejected": -282.2691345214844, "loss": 0.3249, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8329002261161804, "rewards/margins": 2.3765745162963867, "rewards/rejected": -3.209474802017212, "step": 3850 }, { "epoch": 2.6121902210314802, "grad_norm": 43.8161735534668, "learning_rate": 3.2671394799054374e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.08872985839844, "logps/rejected": -288.9317932128906, "loss": 0.3261, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8284339308738708, "rewards/margins": 2.3834288120269775, "rewards/rejected": -3.211862564086914, "step": 3900 }, { "epoch": 2.6456798392498326, "grad_norm": 40.35237503051758, "learning_rate": 3.188337273443656e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.78111267089844, "logps/rejected": -292.10601806640625, "loss": 0.3219, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8962965607643127, "rewards/margins": 2.3189728260040283, "rewards/rejected": -3.2152698040008545, "step": 3950 }, { "epoch": 2.679169457468185, "grad_norm": 51.2871208190918, "learning_rate": 3.1095350669818753e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -197.29296875, "logps/rejected": -297.2159729003906, "loss": 0.3349, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -0.9539132118225098, "rewards/margins": 2.459329605102539, "rewards/rejected": -3.4132425785064697, "step": 4000 }, { "epoch": 2.7126590756865374, "grad_norm": 26.952434539794922, "learning_rate": 3.0307328605200945e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -197.95140075683594, "logps/rejected": -294.74676513671875, "loss": 0.3164, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -0.8712408542633057, "rewards/margins": 2.3473589420318604, "rewards/rejected": -3.218599796295166, "step": 4050 }, { "epoch": 2.7461486939048894, "grad_norm": 23.623044967651367, "learning_rate": 2.9519306540583136e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.4278106689453, "logps/rejected": -270.7493896484375, "loss": 0.3482, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8394105434417725, "rewards/margins": 2.1500954627990723, "rewards/rejected": -2.989506244659424, "step": 4100 }, { "epoch": 2.7796383121232418, "grad_norm": 33.7256965637207, "learning_rate": 2.8731284475965323e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.12367248535156, "logps/rejected": -286.8393859863281, "loss": 0.3118, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -0.8797828555107117, "rewards/margins": 2.488528251647949, "rewards/rejected": -3.3683111667633057, "step": 4150 }, { "epoch": 2.813127930341594, "grad_norm": 26.706951141357422, "learning_rate": 2.7943262411347515e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.7408447265625, "logps/rejected": -285.0479736328125, "loss": 0.3168, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -0.8466379642486572, "rewards/margins": 2.4300098419189453, "rewards/rejected": -3.2766480445861816, "step": 4200 }, { "epoch": 2.8466175485599465, "grad_norm": 38.752418518066406, "learning_rate": 2.7155240346729707e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -198.39134216308594, "logps/rejected": -290.9305725097656, "loss": 0.3135, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -0.9479385614395142, "rewards/margins": 2.4617414474487305, "rewards/rejected": -3.409679889678955, "step": 4250 }, { "epoch": 2.8801071667782985, "grad_norm": 22.20810317993164, "learning_rate": 2.63672182821119e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -202.04925537109375, "logps/rejected": -298.950927734375, "loss": 0.2977, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.897186815738678, "rewards/margins": 2.3512582778930664, "rewards/rejected": -3.2484447956085205, "step": 4300 }, { "epoch": 2.913596784996651, "grad_norm": 66.3785171508789, "learning_rate": 2.557919621749409e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -199.11300659179688, "logps/rejected": -298.3018798828125, "loss": 0.3053, "rewards/accuracies": 0.7587500214576721, "rewards/chosen": -0.9929912090301514, "rewards/margins": 2.481295585632324, "rewards/rejected": -3.4742870330810547, "step": 4350 }, { "epoch": 2.9470864032150033, "grad_norm": 37.80686569213867, "learning_rate": 2.4791174152876277e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -202.31398010253906, "logps/rejected": -276.3575439453125, "loss": 0.3206, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.9125861525535583, "rewards/margins": 2.299701690673828, "rewards/rejected": -3.2122879028320312, "step": 4400 }, { "epoch": 2.9805760214333556, "grad_norm": 125.0142593383789, "learning_rate": 2.4003152088258474e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.2313232421875, "logps/rejected": -284.8965759277344, "loss": 0.3186, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8962329030036926, "rewards/margins": 2.4796411991119385, "rewards/rejected": -3.3758738040924072, "step": 4450 }, { "epoch": 3.014065639651708, "grad_norm": 36.955318450927734, "learning_rate": 2.321513002364066e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.8556671142578, "logps/rejected": -289.2830810546875, "loss": 0.3003, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.953887403011322, "rewards/margins": 2.597318410873413, "rewards/rejected": -3.551206111907959, "step": 4500 }, { "epoch": 3.0475552578700604, "grad_norm": 35.30141830444336, "learning_rate": 2.242710795902285e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -201.59471130371094, "logps/rejected": -296.01190185546875, "loss": 0.2955, "rewards/accuracies": 0.7574999928474426, "rewards/chosen": -0.8759480118751526, "rewards/margins": 2.536679744720459, "rewards/rejected": -3.4126272201538086, "step": 4550 }, { "epoch": 3.081044876088413, "grad_norm": 19.832714080810547, "learning_rate": 2.163908589440504e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.59324645996094, "logps/rejected": -301.8227233886719, "loss": 0.2666, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9135188460350037, "rewards/margins": 2.812459707260132, "rewards/rejected": -3.7259786128997803, "step": 4600 }, { "epoch": 3.1145344943067648, "grad_norm": 20.465190887451172, "learning_rate": 2.0851063829787233e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.68150329589844, "logps/rejected": -292.4168701171875, "loss": 0.2952, "rewards/accuracies": 0.7337499856948853, "rewards/chosen": -0.7880871295928955, "rewards/margins": 2.723849296569824, "rewards/rejected": -3.5119359493255615, "step": 4650 }, { "epoch": 3.148024112525117, "grad_norm": 16.645957946777344, "learning_rate": 2.0063041765169423e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -181.14376831054688, "logps/rejected": -276.7168884277344, "loss": 0.2973, "rewards/accuracies": 0.7350000143051147, "rewards/chosen": -0.8847752213478088, "rewards/margins": 2.6361799240112305, "rewards/rejected": -3.5209546089172363, "step": 4700 }, { "epoch": 3.1815137307434695, "grad_norm": 16.746036529541016, "learning_rate": 1.9275019700551615e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.2080078125, "logps/rejected": -303.0468444824219, "loss": 0.2812, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -0.9689957499504089, "rewards/margins": 2.876345157623291, "rewards/rejected": -3.845341205596924, "step": 4750 }, { "epoch": 3.215003348961822, "grad_norm": 22.3815975189209, "learning_rate": 1.8486997635933806e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -203.40260314941406, "logps/rejected": -296.6044616699219, "loss": 0.308, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.9666973352432251, "rewards/margins": 2.647951364517212, "rewards/rejected": -3.6146488189697266, "step": 4800 }, { "epoch": 3.2484929671801743, "grad_norm": 22.031641006469727, "learning_rate": 1.7698975571315996e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -184.9111328125, "logps/rejected": -287.4691162109375, "loss": 0.2826, "rewards/accuracies": 0.7450000047683716, "rewards/chosen": -0.9380254149436951, "rewards/margins": 2.7892355918884277, "rewards/rejected": -3.7272610664367676, "step": 4850 }, { "epoch": 3.2819825853985263, "grad_norm": 31.574790954589844, "learning_rate": 1.6910953506698187e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.68785095214844, "logps/rejected": -300.5950622558594, "loss": 0.273, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9448862671852112, "rewards/margins": 2.9543023109436035, "rewards/rejected": -3.899188756942749, "step": 4900 }, { "epoch": 3.3154722036168787, "grad_norm": 20.961929321289062, "learning_rate": 1.612293144208038e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.8438720703125, "logps/rejected": -291.8954162597656, "loss": 0.2805, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -0.9848695397377014, "rewards/margins": 2.715031385421753, "rewards/rejected": -3.6999011039733887, "step": 4950 }, { "epoch": 3.348961821835231, "grad_norm": 81.02938079833984, "learning_rate": 1.5334909377462568e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -194.07749938964844, "logps/rejected": -288.35015869140625, "loss": 0.2876, "rewards/accuracies": 0.7537500262260437, "rewards/chosen": -1.0682913064956665, "rewards/margins": 2.6784591674804688, "rewards/rejected": -3.746750593185425, "step": 5000 }, { "epoch": 3.3824514400535834, "grad_norm": 49.065887451171875, "learning_rate": 1.454688731284476e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -203.08660888671875, "logps/rejected": -315.59271240234375, "loss": 0.2685, "rewards/accuracies": 0.7699999809265137, "rewards/chosen": -0.986056923866272, "rewards/margins": 3.007080078125, "rewards/rejected": -3.9931368827819824, "step": 5050 }, { "epoch": 3.415941058271936, "grad_norm": 21.368894577026367, "learning_rate": 1.375886524822695e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -198.26324462890625, "logps/rejected": -298.6892395019531, "loss": 0.2805, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.0937081575393677, "rewards/margins": 2.7569918632507324, "rewards/rejected": -3.8506996631622314, "step": 5100 }, { "epoch": 3.4494306764902882, "grad_norm": 18.683109283447266, "learning_rate": 1.2970843183609141e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -205.4937286376953, "logps/rejected": -288.3065185546875, "loss": 0.2852, "rewards/accuracies": 0.7662500143051147, "rewards/chosen": -1.0320967435836792, "rewards/margins": 2.5120697021484375, "rewards/rejected": -3.544166326522827, "step": 5150 }, { "epoch": 3.48292029470864, "grad_norm": 38.7464599609375, "learning_rate": 1.218282111899133e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -194.1973876953125, "logps/rejected": -299.29559326171875, "loss": 0.2871, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.104549527168274, "rewards/margins": 2.853426218032837, "rewards/rejected": -3.9579761028289795, "step": 5200 }, { "epoch": 3.5164099129269926, "grad_norm": 7.350229740142822, "learning_rate": 1.1394799054373522e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -201.8234405517578, "logps/rejected": -290.8706970214844, "loss": 0.3006, "rewards/accuracies": 0.7387499809265137, "rewards/chosen": -1.0995410680770874, "rewards/margins": 2.636826992034912, "rewards/rejected": -3.736368417739868, "step": 5250 }, { "epoch": 3.549899531145345, "grad_norm": 19.686573028564453, "learning_rate": 1.0606776989755713e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -190.27627563476562, "logps/rejected": -288.27923583984375, "loss": 0.2949, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -1.0866522789001465, "rewards/margins": 2.7148284912109375, "rewards/rejected": -3.801480770111084, "step": 5300 }, { "epoch": 3.5833891493636973, "grad_norm": 23.88344955444336, "learning_rate": 9.818754925137903e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -196.48269653320312, "logps/rejected": -295.1553039550781, "loss": 0.2976, "rewards/accuracies": 0.7262499928474426, "rewards/chosen": -1.0540306568145752, "rewards/margins": 2.6128809452056885, "rewards/rejected": -3.6669113636016846, "step": 5350 }, { "epoch": 3.6168787675820493, "grad_norm": 28.22519874572754, "learning_rate": 9.030732860520094e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -196.87808227539062, "logps/rejected": -294.7259216308594, "loss": 0.2903, "rewards/accuracies": 0.7487499713897705, "rewards/chosen": -1.0297123193740845, "rewards/margins": 2.7472548484802246, "rewards/rejected": -3.7769670486450195, "step": 5400 }, { "epoch": 3.6503683858004017, "grad_norm": 34.18745422363281, "learning_rate": 8.242710795902284e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.99404907226562, "logps/rejected": -290.40350341796875, "loss": 0.2871, "rewards/accuracies": 0.7524999976158142, "rewards/chosen": -1.050000786781311, "rewards/margins": 2.8044447898864746, "rewards/rejected": -3.8544461727142334, "step": 5450 }, { "epoch": 3.683858004018754, "grad_norm": 47.866973876953125, "learning_rate": 7.454688731284475e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -200.01785278320312, "logps/rejected": -314.4879150390625, "loss": 0.2842, "rewards/accuracies": 0.7637500166893005, "rewards/chosen": -1.1343244314193726, "rewards/margins": 2.814058780670166, "rewards/rejected": -3.9483835697174072, "step": 5500 }, { "epoch": 3.7173476222371065, "grad_norm": 42.807334899902344, "learning_rate": 6.666666666666665e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.06690979003906, "logps/rejected": -304.86456298828125, "loss": 0.2987, "rewards/accuracies": 0.7262499928474426, "rewards/chosen": -0.996014416217804, "rewards/margins": 2.8309714794158936, "rewards/rejected": -3.826986074447632, "step": 5550 }, { "epoch": 3.750837240455459, "grad_norm": 43.85563278198242, "learning_rate": 5.8786446020488567e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.9456787109375, "logps/rejected": -302.6964111328125, "loss": 0.2717, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -0.9534288644790649, "rewards/margins": 2.9733974933624268, "rewards/rejected": -3.926826238632202, "step": 5600 }, { "epoch": 3.7843268586738112, "grad_norm": 15.730545043945312, "learning_rate": 5.090622537431047e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -201.83468627929688, "logps/rejected": -303.9621887207031, "loss": 0.2778, "rewards/accuracies": 0.7425000071525574, "rewards/chosen": -1.114385962486267, "rewards/margins": 2.769357919692993, "rewards/rejected": -3.88374400138855, "step": 5650 }, { "epoch": 3.8178164768921636, "grad_norm": 8.295851707458496, "learning_rate": 4.3026004728132384e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -188.35511779785156, "logps/rejected": -294.769287109375, "loss": 0.2819, "rewards/accuracies": 0.7637500166893005, "rewards/chosen": -1.0945212841033936, "rewards/margins": 2.837056875228882, "rewards/rejected": -3.9315783977508545, "step": 5700 }, { "epoch": 3.8513060951105156, "grad_norm": 26.289419174194336, "learning_rate": 3.5145784081954295e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -189.79083251953125, "logps/rejected": -306.619140625, "loss": 0.275, "rewards/accuracies": 0.7674999833106995, "rewards/chosen": -1.047197699546814, "rewards/margins": 2.954050064086914, "rewards/rejected": -4.001247406005859, "step": 5750 }, { "epoch": 3.884795713328868, "grad_norm": 39.87799835205078, "learning_rate": 2.72655634357762e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.89584350585938, "logps/rejected": -300.447998046875, "loss": 0.3016, "rewards/accuracies": 0.7362499833106995, "rewards/chosen": -1.0226908922195435, "rewards/margins": 2.853440284729004, "rewards/rejected": -3.876131057739258, "step": 5800 }, { "epoch": 3.9182853315472204, "grad_norm": 34.19044494628906, "learning_rate": 1.938534278959811e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -198.2811737060547, "logps/rejected": -297.48712158203125, "loss": 0.2798, "rewards/accuracies": 0.7612500190734863, "rewards/chosen": -0.9482996463775635, "rewards/margins": 2.7399239540100098, "rewards/rejected": -3.6882238388061523, "step": 5850 }, { "epoch": 3.9517749497655728, "grad_norm": 50.39924621582031, "learning_rate": 1.1505122143420016e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.64804077148438, "logps/rejected": -296.8222961425781, "loss": 0.2861, "rewards/accuracies": 0.75, "rewards/chosen": -1.1375445127487183, "rewards/margins": 2.8117544651031494, "rewards/rejected": -3.9492990970611572, "step": 5900 }, { "epoch": 3.985264567983925, "grad_norm": 24.80521583557129, "learning_rate": 3.6249014972419224e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.86546325683594, "logps/rejected": -305.5084228515625, "loss": 0.2767, "rewards/accuracies": 0.7475000023841858, "rewards/chosen": -1.0181684494018555, "rewards/margins": 2.8778107166290283, "rewards/rejected": -3.895979642868042, "step": 5950 } ], "logging_steps": 50, "max_steps": 5972, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }