{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 10000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 153.0, "learning_rate": 0.0, "logits/chosen": 0.38140869140625, "logits/rejected": -0.0750732421875, "logps/chosen": -179.625, "logps/rejected": -175.5, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.01251220703125, "rewards/margins": 0.0, "rewards/rejected": 0.01251220703125, "step": 1 }, { "epoch": 0.01, "grad_norm": 152.0, "learning_rate": 1.6333333333333334e-07, "logits/chosen": -0.07141295820474625, "logits/rejected": -0.49311113357543945, "logps/chosen": -169.4234619140625, "logps/rejected": -166.2244873046875, "loss": 0.6897, "rewards/accuracies": 0.32397958636283875, "rewards/chosen": 0.022099709138274193, "rewards/margins": 0.00855239573866129, "rewards/rejected": 0.013539839535951614, "step": 50 }, { "epoch": 0.02, "grad_norm": 173.0, "learning_rate": 3.3e-07, "logits/chosen": -0.19355152547359467, "logits/rejected": -0.6559900045394897, "logps/chosen": -169.9737548828125, "logps/rejected": -168.53875732421875, "loss": 0.6714, "rewards/accuracies": 0.47999998927116394, "rewards/chosen": 0.056079406291246414, "rewards/margins": 0.04595337063074112, "rewards/rejected": 0.010139770805835724, "step": 100 }, { "epoch": 0.03, "grad_norm": 159.0, "learning_rate": 4.966666666666666e-07, "logits/chosen": -0.1707330346107483, "logits/rejected": -0.6063339114189148, "logps/chosen": -168.2937469482422, "logps/rejected": -166.5187530517578, "loss": 0.6156, "rewards/accuracies": 0.7900000214576721, "rewards/chosen": 0.1966903656721115, "rewards/margins": 0.16668151319026947, "rewards/rejected": 0.030118407681584358, "step": 150 }, { "epoch": 0.04, "grad_norm": 87.0, "learning_rate": 6.633333333333334e-07, "logits/chosen": -0.08012771606445312, "logits/rejected": -0.5313219428062439, "logps/chosen": -163.4462432861328, "logps/rejected": -166.76124572753906, "loss": 0.5115, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": 0.4547726511955261, "rewards/margins": 0.4213232398033142, "rewards/rejected": 0.03348724544048309, "step": 200 }, { "epoch": 0.05, "grad_norm": 67.5, "learning_rate": 8.300000000000001e-07, "logits/chosen": -0.17801956832408905, "logits/rejected": -0.6736994981765747, "logps/chosen": -162.8975067138672, "logps/rejected": -169.00250244140625, "loss": 0.3636, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 0.7460748553276062, "rewards/margins": 0.8772411942481995, "rewards/rejected": -0.1307925432920456, "step": 250 }, { "epoch": 0.06, "grad_norm": 29.625, "learning_rate": 9.966666666666667e-07, "logits/chosen": -0.3131498694419861, "logits/rejected": -0.8370306491851807, "logps/chosen": -156.57749938964844, "logps/rejected": -171.12249755859375, "loss": 0.2411, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 0.9195745587348938, "rewards/margins": 1.5089257955551147, "rewards/rejected": -0.5890390276908875, "step": 300 }, { "epoch": 0.07, "grad_norm": 8.625, "learning_rate": 1.1633333333333333e-06, "logits/chosen": -0.809218168258667, "logits/rejected": -1.343685269355774, "logps/chosen": -165.8387451171875, "logps/rejected": -189.9462432861328, "loss": 0.1339, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.9223541021347046, "rewards/margins": 2.6102733612060547, "rewards/rejected": -1.6883777379989624, "step": 350 }, { "epoch": 0.08, "grad_norm": 2.234375, "learning_rate": 1.3300000000000002e-06, "logits/chosen": -1.2586804628372192, "logits/rejected": -1.9825918674468994, "logps/chosen": -158.1649932861328, "logps/rejected": -198.03750610351562, "loss": 0.0961, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 1.2969841957092285, "rewards/margins": 4.182460784912109, "rewards/rejected": -2.8857672214508057, "step": 400 }, { "epoch": 0.09, "grad_norm": 0.349609375, "learning_rate": 1.4966666666666668e-06, "logits/chosen": -1.5829663276672363, "logits/rejected": -2.467480421066284, "logps/chosen": -147.5625, "logps/rejected": -204.00750732421875, "loss": 0.1038, "rewards/accuracies": 0.8525000214576721, "rewards/chosen": 1.9529736042022705, "rewards/margins": 5.97265625, "rewards/rejected": -4.018417835235596, "step": 450 }, { "epoch": 0.1, "grad_norm": 0.5625, "learning_rate": 1.6633333333333334e-06, "logits/chosen": -1.8108301162719727, "logits/rejected": -2.774589776992798, "logps/chosen": -147.88250732421875, "logps/rejected": -219.89625549316406, "loss": 0.0802, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 2.4084813594818115, "rewards/margins": 7.462968826293945, "rewards/rejected": -5.055732250213623, "step": 500 }, { "epoch": 0.11, "grad_norm": 0.21484375, "learning_rate": 1.83e-06, "logits/chosen": -1.8340039253234863, "logits/rejected": -2.798632860183716, "logps/chosen": -138.77374267578125, "logps/rejected": -212.2449951171875, "loss": 0.1111, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": 2.4734740257263184, "rewards/margins": 7.538750171661377, "rewards/rejected": -5.063173770904541, "step": 550 }, { "epoch": 0.12, "grad_norm": 0.1806640625, "learning_rate": 1.996666666666667e-06, "logits/chosen": -1.9733397960662842, "logits/rejected": -2.937753915786743, "logps/chosen": -142.70875549316406, "logps/rejected": -223.96624755859375, "loss": 0.0953, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.598142147064209, "rewards/margins": 8.3950777053833, "rewards/rejected": -5.798213005065918, "step": 600 }, { "epoch": 0.13, "grad_norm": 0.2265625, "learning_rate": 2.1633333333333335e-06, "logits/chosen": -2.0656299591064453, "logits/rejected": -3.027539014816284, "logps/chosen": -142.24249267578125, "logps/rejected": -230.47250366210938, "loss": 0.0883, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.734023332595825, "rewards/margins": 9.052812576293945, "rewards/rejected": -6.316962718963623, "step": 650 }, { "epoch": 0.14, "grad_norm": 0.30859375, "learning_rate": 2.33e-06, "logits/chosen": -1.9706201553344727, "logits/rejected": -2.9912109375, "logps/chosen": -138.24249267578125, "logps/rejected": -228.1925048828125, "loss": 0.09, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 2.9716455936431885, "rewards/margins": 9.053828239440918, "rewards/rejected": -6.082441329956055, "step": 700 }, { "epoch": 0.15, "grad_norm": 0.01446533203125, "learning_rate": 2.4966666666666668e-06, "logits/chosen": -1.9857901334762573, "logits/rejected": -2.991640567779541, "logps/chosen": -134.00999450683594, "logps/rejected": -224.47625732421875, "loss": 0.0848, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 3.2274608612060547, "rewards/margins": 9.318828582763672, "rewards/rejected": -6.091513633728027, "step": 750 }, { "epoch": 0.16, "grad_norm": 0.15625, "learning_rate": 2.6633333333333334e-06, "logits/chosen": -1.9683300256729126, "logits/rejected": -3.0144922733306885, "logps/chosen": -142.32000732421875, "logps/rejected": -238.3975067138672, "loss": 0.071, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 3.2487499713897705, "rewards/margins": 9.865859031677246, "rewards/rejected": -6.616034984588623, "step": 800 }, { "epoch": 0.17, "grad_norm": 0.00604248046875, "learning_rate": 2.83e-06, "logits/chosen": -2.0511231422424316, "logits/rejected": -3.087148427963257, "logps/chosen": -138.8387451171875, "logps/rejected": -237.4425048828125, "loss": 0.0813, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 3.2463672161102295, "rewards/margins": 10.090624809265137, "rewards/rejected": -6.84329080581665, "step": 850 }, { "epoch": 0.18, "grad_norm": 0.01470947265625, "learning_rate": 2.996666666666667e-06, "logits/chosen": -2.1478612422943115, "logits/rejected": -3.173710823059082, "logps/chosen": -143.4824981689453, "logps/rejected": -246.1737518310547, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 3.197275400161743, "rewards/margins": 10.487030982971191, "rewards/rejected": -7.29049825668335, "step": 900 }, { "epoch": 0.19, "grad_norm": 0.0067138671875, "learning_rate": 3.1633333333333337e-06, "logits/chosen": -1.9968359470367432, "logits/rejected": -3.075078010559082, "logps/chosen": -136.73875427246094, "logps/rejected": -234.65499877929688, "loss": 0.0813, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 3.270380973815918, "rewards/margins": 10.061562538146973, "rewards/rejected": -6.791113376617432, "step": 950 }, { "epoch": 0.2, "grad_norm": 0.055908203125, "learning_rate": 3.3300000000000003e-06, "logits/chosen": -2.032480478286743, "logits/rejected": -3.120234489440918, "logps/chosen": -141.06625366210938, "logps/rejected": -247.29124450683594, "loss": 0.0813, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 3.2919139862060547, "rewards/margins": 10.692343711853027, "rewards/rejected": -7.398417949676514, "step": 1000 }, { "epoch": 0.21, "grad_norm": 0.017822265625, "learning_rate": 3.496666666666667e-06, "logits/chosen": -2.1911962032318115, "logits/rejected": -3.1728124618530273, "logps/chosen": -143.2687530517578, "logps/rejected": -246.8125, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 3.0637621879577637, "rewards/margins": 10.748281478881836, "rewards/rejected": -7.685234546661377, "step": 1050 }, { "epoch": 0.22, "grad_norm": 0.060546875, "learning_rate": 3.6633333333333336e-06, "logits/chosen": -2.195253849029541, "logits/rejected": -3.108535051345825, "logps/chosen": -134.12750244140625, "logps/rejected": -233.7412567138672, "loss": 0.1124, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 2.9126367568969727, "rewards/margins": 10.165234565734863, "rewards/rejected": -7.252851486206055, "step": 1100 }, { "epoch": 0.23, "grad_norm": 0.00811767578125, "learning_rate": 3.830000000000001e-06, "logits/chosen": -2.157832145690918, "logits/rejected": -3.157460927963257, "logps/chosen": -140.08250427246094, "logps/rejected": -249.22000122070312, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 3.393681526184082, "rewards/margins": 11.291093826293945, "rewards/rejected": -7.8983154296875, "step": 1150 }, { "epoch": 0.24, "grad_norm": 0.37109375, "learning_rate": 3.996666666666667e-06, "logits/chosen": -2.176523447036743, "logits/rejected": -3.145156145095825, "logps/chosen": -136.72625732421875, "logps/rejected": -240.5812530517578, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 3.007997989654541, "rewards/margins": 10.645390510559082, "rewards/rejected": -7.636513710021973, "step": 1200 }, { "epoch": 0.25, "grad_norm": 0.0303955078125, "learning_rate": 4.163333333333334e-06, "logits/chosen": -2.346796989440918, "logits/rejected": -3.239375114440918, "logps/chosen": -141.64999389648438, "logps/rejected": -254.55624389648438, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 3.1400487422943115, "rewards/margins": 11.464765548706055, "rewards/rejected": -8.323661804199219, "step": 1250 }, { "epoch": 0.26, "grad_norm": 0.8828125, "learning_rate": 4.33e-06, "logits/chosen": -2.233154296875, "logits/rejected": -3.121083974838257, "logps/chosen": -137.37875366210938, "logps/rejected": -247.23875427246094, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.990058660507202, "rewards/margins": 11.343280792236328, "rewards/rejected": -8.351679801940918, "step": 1300 }, { "epoch": 0.27, "grad_norm": 0.0081787109375, "learning_rate": 4.496666666666667e-06, "logits/chosen": -2.3371288776397705, "logits/rejected": -3.1724023818969727, "logps/chosen": -142.37249755859375, "logps/rejected": -257.1512451171875, "loss": 0.0519, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.159780263900757, "rewards/margins": 11.788749694824219, "rewards/rejected": -8.627890586853027, "step": 1350 }, { "epoch": 0.28, "grad_norm": 1.15625, "learning_rate": 4.663333333333333e-06, "logits/chosen": -2.136476993560791, "logits/rejected": -3.0350780487060547, "logps/chosen": -141.83624267578125, "logps/rejected": -258.1824951171875, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.5560545921325684, "rewards/margins": 11.789999961853027, "rewards/rejected": -9.23446273803711, "step": 1400 }, { "epoch": 0.29, "grad_norm": 0.001434326171875, "learning_rate": 4.83e-06, "logits/chosen": -2.327712297439575, "logits/rejected": -3.039235830307007, "logps/chosen": -151.52749633789062, "logps/rejected": -260.1824951171875, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 1.725927710533142, "rewards/margins": 11.106093406677246, "rewards/rejected": -9.376816749572754, "step": 1450 }, { "epoch": 0.3, "grad_norm": 0.000873565673828125, "learning_rate": 4.9966666666666665e-06, "logits/chosen": -2.236884832382202, "logits/rejected": -2.9620020389556885, "logps/chosen": -152.15249633789062, "logps/rejected": -266.6524963378906, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 1.9792224168777466, "rewards/margins": 11.753125190734863, "rewards/rejected": -9.775390625, "step": 1500 }, { "epoch": 0.31, "grad_norm": 1.1484375, "learning_rate": 4.99983747144442e-06, "logits/chosen": -2.1749095916748047, "logits/rejected": -3.0108959674835205, "logps/chosen": -158.8925018310547, "logps/rejected": -274.04998779296875, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 1.5659692287445068, "rewards/margins": 11.828437805175781, "rewards/rejected": -10.266562461853027, "step": 1550 }, { "epoch": 0.32, "grad_norm": 0.00445556640625, "learning_rate": 4.999336572604176e-06, "logits/chosen": -2.3599512577056885, "logits/rejected": -2.979990243911743, "logps/chosen": -150.02499389648438, "logps/rejected": -264.9649963378906, "loss": 0.0934, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 1.7431994676589966, "rewards/margins": 11.767656326293945, "rewards/rejected": -10.0248441696167, "step": 1600 }, { "epoch": 0.33, "grad_norm": 0.07763671875, "learning_rate": 4.998497303600191e-06, "logits/chosen": -2.491284132003784, "logits/rejected": -3.098569393157959, "logps/chosen": -151.4949951171875, "logps/rejected": -261.3475036621094, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 1.6169995069503784, "rewards/margins": 11.283594131469727, "rewards/rejected": -9.667499542236328, "step": 1650 }, { "epoch": 0.34, "grad_norm": 0.001434326171875, "learning_rate": 4.997319778056057e-06, "logits/chosen": -2.4677734375, "logits/rejected": -3.164921760559082, "logps/chosen": -160.6125030517578, "logps/rejected": -291.8324890136719, "loss": 0.0779, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.3262377977371216, "rewards/margins": 13.2251558303833, "rewards/rejected": -11.900468826293945, "step": 1700 }, { "epoch": 0.35, "grad_norm": 0.0142822265625, "learning_rate": 4.995804155389881e-06, "logits/chosen": -2.2496564388275146, "logits/rejected": -2.910781145095825, "logps/chosen": -163.3000030517578, "logps/rejected": -282.75, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 1.067590355873108, "rewards/margins": 12.181406021118164, "rewards/rejected": -11.118515968322754, "step": 1750 }, { "epoch": 0.36, "grad_norm": 1.4921875, "learning_rate": 4.9939506407927115e-06, "logits/chosen": -2.496284246444702, "logits/rejected": -3.016840934753418, "logps/chosen": -165.72999572753906, "logps/rejected": -291.8699951171875, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 0.7725061178207397, "rewards/margins": 12.811249732971191, "rewards/rejected": -12.039375305175781, "step": 1800 }, { "epoch": 0.37, "grad_norm": 0.0125732421875, "learning_rate": 4.991759485200754e-06, "logits/chosen": -2.5551586151123047, "logits/rejected": -2.9110498428344727, "logps/chosen": -162.89500427246094, "logps/rejected": -287.5174865722656, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.4655786156654358, "rewards/margins": 12.595937728881836, "rewards/rejected": -12.12906265258789, "step": 1850 }, { "epoch": 0.38, "grad_norm": 1.140625, "learning_rate": 4.989230985261403e-06, "logits/chosen": -2.6131153106689453, "logits/rejected": -2.968681573867798, "logps/chosen": -168.32749938964844, "logps/rejected": -289.5, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 0.41573241353034973, "rewards/margins": 12.315781593322754, "rewards/rejected": -11.899062156677246, "step": 1900 }, { "epoch": 0.39, "grad_norm": 0.0218505859375, "learning_rate": 4.986365483293072e-06, "logits/chosen": -2.2319605350494385, "logits/rejected": -2.6794629096984863, "logps/chosen": -165.3937530517578, "logps/rejected": -280.6925048828125, "loss": 0.0832, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 0.6209277510643005, "rewards/margins": 11.815312385559082, "rewards/rejected": -11.191679954528809, "step": 1950 }, { "epoch": 0.4, "grad_norm": 1.2578125, "learning_rate": 4.9831633672388605e-06, "logits/chosen": -2.148073673248291, "logits/rejected": -2.8818554878234863, "logps/chosen": -156.14500427246094, "logps/rejected": -290.29998779296875, "loss": 0.0674, "rewards/accuracies": 0.9024999737739563, "rewards/chosen": 1.61712646484375, "rewards/margins": 13.622812271118164, "rewards/rejected": -12.008906364440918, "step": 2000 }, { "epoch": 0.41, "grad_norm": 0.00494384765625, "learning_rate": 4.979625070614023e-06, "logits/chosen": -1.928134799003601, "logits/rejected": -2.771103620529175, "logps/chosen": -149.2050018310547, "logps/rejected": -288.3500061035156, "loss": 0.0674, "rewards/accuracies": 0.9024999737739563, "rewards/chosen": 2.3412108421325684, "rewards/margins": 14.15218734741211, "rewards/rejected": -11.807969093322754, "step": 2050 }, { "epoch": 0.42, "grad_norm": 0.047607421875, "learning_rate": 4.975751072447283e-06, "logits/chosen": -1.7242242097854614, "logits/rejected": -2.762929677963257, "logps/chosen": -138.24374389648438, "logps/rejected": -269.7225036621094, "loss": 0.0622, "rewards/accuracies": 0.9100000262260437, "rewards/chosen": 3.4145116806030273, "rewards/margins": 13.523906707763672, "rewards/rejected": -10.108515739440918, "step": 2100 }, { "epoch": 0.43, "grad_norm": 0.93359375, "learning_rate": 4.9715418972159794e-06, "logits/chosen": -1.8216897249221802, "logits/rejected": -2.836932420730591, "logps/chosen": -136.2324981689453, "logps/rejected": -263.7225036621094, "loss": 0.102, "rewards/accuracies": 0.8525000214576721, "rewards/chosen": 3.042714834213257, "rewards/margins": 12.95718765258789, "rewards/rejected": -9.909453392028809, "step": 2150 }, { "epoch": 0.44, "grad_norm": 0.000362396240234375, "learning_rate": 4.96699811477506e-06, "logits/chosen": -2.153923273086548, "logits/rejected": -3.0781617164611816, "logps/chosen": -139.92250061035156, "logps/rejected": -269.2449951171875, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.6978321075439453, "rewards/margins": 13.111719131469727, "rewards/rejected": -10.4213285446167, "step": 2200 }, { "epoch": 0.45, "grad_norm": 0.00933837890625, "learning_rate": 4.962120340279933e-06, "logits/chosen": -2.206913948059082, "logits/rejected": -3.013432502746582, "logps/chosen": -149.11500549316406, "logps/rejected": -284.74749755859375, "loss": 0.0778, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.5205469131469727, "rewards/margins": 13.883281707763672, "rewards/rejected": -11.363203048706055, "step": 2250 }, { "epoch": 0.46, "grad_norm": 0.007232666015625, "learning_rate": 4.956909234103184e-06, "logits/chosen": -2.232741594314575, "logits/rejected": -2.846698045730591, "logps/chosen": -147.88999938964844, "logps/rejected": -290.8949890136719, "loss": 0.0691, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 2.5749120712280273, "rewards/margins": 14.527656555175781, "rewards/rejected": -11.950780868530273, "step": 2300 }, { "epoch": 0.47, "grad_norm": 0.6015625, "learning_rate": 4.951365501745172e-06, "logits/chosen": -2.397939443588257, "logits/rejected": -2.8929004669189453, "logps/chosen": -144.75750732421875, "logps/rejected": -275.7850036621094, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 2.490410089492798, "rewards/margins": 13.264843940734863, "rewards/rejected": -10.7734375, "step": 2350 }, { "epoch": 0.48, "grad_norm": 0.02099609375, "learning_rate": 4.945489893738518e-06, "logits/chosen": -2.36920166015625, "logits/rejected": -2.889394521713257, "logps/chosen": -136.7274932861328, "logps/rejected": -272.572509765625, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.901992082595825, "rewards/margins": 13.673593521118164, "rewards/rejected": -10.7650785446167, "step": 2400 }, { "epoch": 0.49, "grad_norm": 0.007110595703125, "learning_rate": 4.93928320554649e-06, "logits/chosen": -1.9525684118270874, "logits/rejected": -2.8195831775665283, "logps/chosen": -135.28875732421875, "logps/rejected": -266.6099853515625, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.462538957595825, "rewards/margins": 13.346015930175781, "rewards/rejected": -9.8804292678833, "step": 2450 }, { "epoch": 0.5, "grad_norm": 0.94140625, "learning_rate": 4.932746277455317e-06, "logits/chosen": -1.662766456604004, "logits/rejected": -2.691631555557251, "logps/chosen": -133.22000122070312, "logps/rejected": -272.822509765625, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 3.5004687309265137, "rewards/margins": 14.248437881469727, "rewards/rejected": -10.7514066696167, "step": 2500 }, { "epoch": 0.51, "grad_norm": 0.00982666015625, "learning_rate": 4.9258799944604215e-06, "logits/chosen": -1.7904162406921387, "logits/rejected": -2.6735968589782715, "logps/chosen": -132.4512481689453, "logps/rejected": -281.63751220703125, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 3.9552342891693115, "rewards/margins": 15.276874542236328, "rewards/rejected": -11.324609756469727, "step": 2550 }, { "epoch": 0.52, "grad_norm": 0.013916015625, "learning_rate": 4.918685286146611e-06, "logits/chosen": -1.7311677932739258, "logits/rejected": -2.691605567932129, "logps/chosen": -132.29249572753906, "logps/rejected": -285.6025085449219, "loss": 0.0834, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 3.967148542404175, "rewards/margins": 15.4975004196167, "rewards/rejected": -11.534062385559082, "step": 2600 }, { "epoch": 0.53, "grad_norm": 0.005035400390625, "learning_rate": 4.911163126562218e-06, "logits/chosen": -1.7594763040542603, "logits/rejected": -2.734794855117798, "logps/chosen": -134.93499755859375, "logps/rejected": -289.4525146484375, "loss": 0.0933, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 3.4932031631469727, "rewards/margins": 15.765155792236328, "rewards/rejected": -12.267890930175781, "step": 2650 }, { "epoch": 0.54, "grad_norm": 0.00014495849609375, "learning_rate": 4.903314534087243e-06, "logits/chosen": -1.64252507686615, "logits/rejected": -2.694523811340332, "logps/chosen": -137.2375030517578, "logps/rejected": -277.3800048828125, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 3.0264551639556885, "rewards/margins": 14.148124694824219, "rewards/rejected": -11.121874809265137, "step": 2700 }, { "epoch": 0.55, "grad_norm": 0.73046875, "learning_rate": 4.895140571295469e-06, "logits/chosen": -1.6052197217941284, "logits/rejected": -2.6878418922424316, "logps/chosen": -130.99374389648438, "logps/rejected": -274.2774963378906, "loss": 0.0933, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 3.450859308242798, "rewards/margins": 14.58187484741211, "rewards/rejected": -11.129921913146973, "step": 2750 }, { "epoch": 0.56, "grad_norm": 0.001068115234375, "learning_rate": 4.886642344810612e-06, "logits/chosen": -2.1891088485717773, "logits/rejected": -2.992116689682007, "logps/chosen": -131.99749755859375, "logps/rejected": -261.6474914550781, "loss": 0.1072, "rewards/accuracies": 0.8450000286102295, "rewards/chosen": 3.183906316757202, "rewards/margins": 13.224843978881836, "rewards/rejected": -10.036718368530273, "step": 2800 }, { "epoch": 0.57, "grad_norm": 0.0004825592041015625, "learning_rate": 4.877821005156504e-06, "logits/chosen": -2.3619725704193115, "logits/rejected": -3.226367235183716, "logps/chosen": -138.77000427246094, "logps/rejected": -263.8999938964844, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.243339776992798, "rewards/margins": 12.722031593322754, "rewards/rejected": -9.478281021118164, "step": 2850 }, { "epoch": 0.58, "grad_norm": 0.0098876953125, "learning_rate": 4.868677746601325e-06, "logits/chosen": -2.1765477657318115, "logits/rejected": -3.075366258621216, "logps/chosen": -134.6649932861328, "logps/rejected": -251.0850067138672, "loss": 0.102, "rewards/accuracies": 0.8525000214576721, "rewards/chosen": 3.067031145095825, "rewards/margins": 11.9115629196167, "rewards/rejected": -8.846875190734863, "step": 2900 }, { "epoch": 0.59, "grad_norm": 1.1796875, "learning_rate": 4.859213806995924e-06, "logits/chosen": -2.022216796875, "logits/rejected": -3.042172908782959, "logps/chosen": -135.00750732421875, "logps/rejected": -262.7250061035156, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 3.4410157203674316, "rewards/margins": 13.021562576293945, "rewards/rejected": -9.582109451293945, "step": 2950 }, { "epoch": 0.6, "grad_norm": 0.0026092529296875, "learning_rate": 4.849430467606228e-06, "logits/chosen": -1.728432059288025, "logits/rejected": -2.991831064224243, "logps/chosen": -129.99374389648438, "logps/rejected": -245.56500244140625, "loss": 0.1003, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": 3.511894464492798, "rewards/margins": 11.818437576293945, "rewards/rejected": -8.303203582763672, "step": 3000 }, { "epoch": 0.61, "grad_norm": 0.00012111663818359375, "learning_rate": 4.839329052939784e-06, "logits/chosen": -1.3972948789596558, "logits/rejected": -2.9753711223602295, "logps/chosen": -133.10374450683594, "logps/rejected": -260.8924865722656, "loss": 0.0778, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 3.828476667404175, "rewards/margins": 12.948124885559082, "rewards/rejected": -9.120469093322754, "step": 3050 }, { "epoch": 0.62, "grad_norm": 0.53515625, "learning_rate": 4.82891093056644e-06, "logits/chosen": -1.142480492591858, "logits/rejected": -2.7538769245147705, "logps/chosen": -126.52874755859375, "logps/rejected": -251.2100067138672, "loss": 0.0985, "rewards/accuracies": 0.8575000166893005, "rewards/chosen": 4.20089864730835, "rewards/margins": 12.569218635559082, "rewards/rejected": -8.369375228881836, "step": 3100 }, { "epoch": 0.63, "grad_norm": 0.0034332275390625, "learning_rate": 4.818177510933194e-06, "logits/chosen": -1.1324691772460938, "logits/rejected": -2.7247815132141113, "logps/chosen": -128.28125, "logps/rejected": -250.6425018310547, "loss": 0.0933, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 4.047187328338623, "rewards/margins": 12.607500076293945, "rewards/rejected": -8.559687614440918, "step": 3150 }, { "epoch": 0.64, "grad_norm": 0.015380859375, "learning_rate": 4.807130247173252e-06, "logits/chosen": -0.9328582882881165, "logits/rejected": -2.533745050430298, "logps/chosen": -124.90499877929688, "logps/rejected": -245.25250244140625, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 4.320508003234863, "rewards/margins": 12.207812309265137, "rewards/rejected": -7.890781402587891, "step": 3200 }, { "epoch": 0.65, "grad_norm": 0.546875, "learning_rate": 4.795770634909287e-06, "logits/chosen": -0.9098020792007446, "logits/rejected": -2.5764403343200684, "logps/chosen": -127.57749938964844, "logps/rejected": -257.6549987792969, "loss": 0.0657, "rewards/accuracies": 0.9049999713897705, "rewards/chosen": 4.306952953338623, "rewards/margins": 13.306875228881836, "rewards/rejected": -8.999530792236328, "step": 3250 }, { "epoch": 0.66, "grad_norm": 0.00775146484375, "learning_rate": 4.784100212050959e-06, "logits/chosen": -1.165554165840149, "logits/rejected": -2.720310688018799, "logps/chosen": -135.34750366210938, "logps/rejected": -268.3550109863281, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 3.560781240463257, "rewards/margins": 13.581093788146973, "rewards/rejected": -10.021249771118164, "step": 3300 }, { "epoch": 0.67, "grad_norm": 0.67578125, "learning_rate": 4.772120558586711e-06, "logits/chosen": -1.4726300239562988, "logits/rejected": -2.9082860946655273, "logps/chosen": -136.72500610351562, "logps/rejected": -266.2174987792969, "loss": 0.0864, "rewards/accuracies": 0.875, "rewards/chosen": 3.1438281536102295, "rewards/margins": 13.237656593322754, "rewards/rejected": -10.093280792236328, "step": 3350 }, { "epoch": 0.68, "grad_norm": 0.000457763671875, "learning_rate": 4.759833296369855e-06, "logits/chosen": -1.7112644910812378, "logits/rejected": -2.9018359184265137, "logps/chosen": -144.86500549316406, "logps/rejected": -274.8800048828125, "loss": 0.0864, "rewards/accuracies": 0.875, "rewards/chosen": 2.424466609954834, "rewards/margins": 13.268437385559082, "rewards/rejected": -10.842421531677246, "step": 3400 }, { "epoch": 0.69, "grad_norm": 0.00023937225341796875, "learning_rate": 4.747240088899007e-06, "logits/chosen": -1.8725781440734863, "logits/rejected": -2.871826171875, "logps/chosen": -145.64500427246094, "logps/rejected": -284.7799987792969, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 2.372001886367798, "rewards/margins": 13.883749961853027, "rewards/rejected": -11.511327743530273, "step": 3450 }, { "epoch": 0.7, "grad_norm": 0.0089111328125, "learning_rate": 4.734342641092873e-06, "logits/chosen": -1.76885986328125, "logits/rejected": -2.764961004257202, "logps/chosen": -133.86749267578125, "logps/rejected": -264.427490234375, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.250908136367798, "rewards/margins": 13.281562805175781, "rewards/rejected": -10.029687881469727, "step": 3500 }, { "epoch": 0.71, "grad_norm": 0.009765625, "learning_rate": 4.72114269905943e-06, "logits/chosen": -1.7043505907058716, "logits/rejected": -2.7690234184265137, "logps/chosen": -130.1699981689453, "logps/rejected": -268.5350036621094, "loss": 0.0761, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": 3.886406183242798, "rewards/margins": 13.834218978881836, "rewards/rejected": -9.946484565734863, "step": 3550 }, { "epoch": 0.72, "grad_norm": 0.004180908203125, "learning_rate": 4.70764204985953e-06, "logits/chosen": -1.7190561294555664, "logits/rejected": -2.7930660247802734, "logps/chosen": -132.5762481689453, "logps/rejected": -261.9324951171875, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 3.5982422828674316, "rewards/margins": 13.246718406677246, "rewards/rejected": -9.651249885559082, "step": 3600 }, { "epoch": 0.73, "grad_norm": 0.3984375, "learning_rate": 4.693842521264963e-06, "logits/chosen": -1.6216427087783813, "logits/rejected": -2.808516263961792, "logps/chosen": -126.45999908447266, "logps/rejected": -260.43499755859375, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 3.8330078125, "rewards/margins": 13.640625, "rewards/rejected": -9.81070327758789, "step": 3650 }, { "epoch": 0.74, "grad_norm": 0.0030670166015625, "learning_rate": 4.679745981511005e-06, "logits/chosen": -1.497025728225708, "logits/rejected": -2.7889842987060547, "logps/chosen": -127.35624694824219, "logps/rejected": -263.0849914550781, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 4.245625019073486, "rewards/margins": 13.9360933303833, "rewards/rejected": -9.694062232971191, "step": 3700 }, { "epoch": 0.75, "grad_norm": 0.4609375, "learning_rate": 4.665354339043487e-06, "logits/chosen": -1.61189603805542, "logits/rejected": -2.818247079849243, "logps/chosen": -125.53874969482422, "logps/rejected": -262.56500244140625, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 4.035273551940918, "rewards/margins": 13.862030982971191, "rewards/rejected": -9.828046798706055, "step": 3750 }, { "epoch": 0.76, "grad_norm": 0.005584716796875, "learning_rate": 4.650669542260426e-06, "logits/chosen": -1.8283697366714478, "logits/rejected": -3.02060604095459, "logps/chosen": -132.30624389648438, "logps/rejected": -265.7875061035156, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.852304697036743, "rewards/margins": 13.544843673706055, "rewards/rejected": -9.695077896118164, "step": 3800 }, { "epoch": 0.77, "grad_norm": 0.006256103515625, "learning_rate": 4.635693579248238e-06, "logits/chosen": -1.77850341796875, "logits/rejected": -2.859311580657959, "logps/chosen": -131.875, "logps/rejected": -267.4624938964844, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.6142187118530273, "rewards/margins": 13.896875381469727, "rewards/rejected": -10.281562805175781, "step": 3850 }, { "epoch": 0.78, "grad_norm": 0.0023345947265625, "learning_rate": 4.620428477512588e-06, "logits/chosen": -1.6927603483200073, "logits/rejected": -2.8547582626342773, "logps/chosen": -132.7550048828125, "logps/rejected": -273.0150146484375, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 3.655820369720459, "rewards/margins": 14.208906173706055, "rewards/rejected": -10.555000305175781, "step": 3900 }, { "epoch": 0.79, "grad_norm": 0.8671875, "learning_rate": 4.604876303703892e-06, "logits/chosen": -1.5046154260635376, "logits/rejected": -2.71980357170105, "logps/chosen": -127.0512466430664, "logps/rejected": -260.4525146484375, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.969980478286743, "rewards/margins": 13.666093826293945, "rewards/rejected": -9.696015357971191, "step": 3950 }, { "epoch": 0.8, "grad_norm": 0.91796875, "learning_rate": 4.5890391633375345e-06, "logits/chosen": -1.6800073385238647, "logits/rejected": -2.810532331466675, "logps/chosen": -141.1374969482422, "logps/rejected": -285.43499755859375, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 3.1419239044189453, "rewards/margins": 14.576562881469727, "rewards/rejected": -11.4341402053833, "step": 4000 }, { "epoch": 0.81, "grad_norm": 1.171875, "learning_rate": 4.572919200508805e-06, "logits/chosen": -1.7100884914398193, "logits/rejected": -2.719179630279541, "logps/chosen": -143.1024932861328, "logps/rejected": -274.3575134277344, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.4612793922424316, "rewards/margins": 13.33968734741211, "rewards/rejected": -10.880234718322754, "step": 4050 }, { "epoch": 0.82, "grad_norm": 0.78515625, "learning_rate": 4.556518597602633e-06, "logits/chosen": -1.8037463426589966, "logits/rejected": -2.8130078315734863, "logps/chosen": -149.1824951171875, "logps/rejected": -280.9200134277344, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 2.325624942779541, "rewards/margins": 13.437812805175781, "rewards/rejected": -11.111093521118164, "step": 4100 }, { "epoch": 0.83, "grad_norm": 0.00099945068359375, "learning_rate": 4.539839574998117e-06, "logits/chosen": -1.8497778177261353, "logits/rejected": -2.873305559158325, "logps/chosen": -151.7624969482422, "logps/rejected": -289.9024963378906, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.024599552154541, "rewards/margins": 14.141249656677246, "rewards/rejected": -12.12093734741211, "step": 4150 }, { "epoch": 0.84, "grad_norm": 5.90625, "learning_rate": 4.522884390767928e-06, "logits/chosen": -2.027651309967041, "logits/rejected": -2.663203239440918, "logps/chosen": -154.8524932861328, "logps/rejected": -288.4624938964844, "loss": 0.1054, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": 0.9060644507408142, "rewards/margins": 13.541406631469727, "rewards/rejected": -12.634062767028809, "step": 4200 }, { "epoch": 0.85, "grad_norm": 0.671875, "learning_rate": 4.5056553403726014e-06, "logits/chosen": -1.9349523782730103, "logits/rejected": -2.789375066757202, "logps/chosen": -148.86500549316406, "logps/rejected": -268.6025085449219, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.1701366901397705, "rewards/margins": 12.2095308303833, "rewards/rejected": -10.041191101074219, "step": 4250 }, { "epoch": 0.86, "grad_norm": 5.78125, "learning_rate": 4.488154756349765e-06, "logits/chosen": -1.8653271198272705, "logits/rejected": -2.8621387481689453, "logps/chosen": -143.14625549316406, "logps/rejected": -274.0362548828125, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.540623664855957, "rewards/margins": 13.264374732971191, "rewards/rejected": -10.722421646118164, "step": 4300 }, { "epoch": 0.87, "grad_norm": 0.65234375, "learning_rate": 4.470385007998354e-06, "logits/chosen": -2.208327054977417, "logits/rejected": -3.1323602199554443, "logps/chosen": -146.74249267578125, "logps/rejected": -301.1075134277344, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 2.182009220123291, "rewards/margins": 15.532031059265137, "rewards/rejected": -13.352360725402832, "step": 4350 }, { "epoch": 0.88, "grad_norm": 0.93359375, "learning_rate": 4.452348501057847e-06, "logits/chosen": -2.0818140506744385, "logits/rejected": -3.202207088470459, "logps/chosen": -157.0050048828125, "logps/rejected": -315.1000061035156, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 1.374355435371399, "rewards/margins": 15.971718788146973, "rewards/rejected": -14.5912504196167, "step": 4400 }, { "epoch": 0.89, "grad_norm": 0.0002765655517578125, "learning_rate": 4.434047677382563e-06, "logits/chosen": -1.5668798685073853, "logits/rejected": -2.893112897872925, "logps/chosen": -147.7725067138672, "logps/rejected": -312.5950012207031, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 1.8955810070037842, "rewards/margins": 16.66828155517578, "rewards/rejected": -14.76968765258789, "step": 4450 }, { "epoch": 0.9, "grad_norm": 7.724761962890625e-05, "learning_rate": 4.415485014611076e-06, "logits/chosen": -1.6306848526000977, "logits/rejected": -3.0001611709594727, "logps/chosen": -143.4774932861328, "logps/rejected": -301.7674865722656, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.233813524246216, "rewards/margins": 16.112031936645508, "rewards/rejected": -13.87906265258789, "step": 4500 }, { "epoch": 0.91, "grad_norm": 1.3984375, "learning_rate": 4.396663025830785e-06, "logits/chosen": -1.5060467720031738, "logits/rejected": -3.0440375804901123, "logps/chosen": -146.45249938964844, "logps/rejected": -308.010009765625, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.640126943588257, "rewards/margins": 16.458906173706055, "rewards/rejected": -13.8149995803833, "step": 4550 }, { "epoch": 0.92, "grad_norm": 0.00164031982421875, "learning_rate": 4.377584259237676e-06, "logits/chosen": -1.5768399238586426, "logits/rejected": -3.0472412109375, "logps/chosen": -144.15249633789062, "logps/rejected": -300.93499755859375, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.490654230117798, "rewards/margins": 16.099843978881836, "rewards/rejected": -13.607500076293945, "step": 4600 }, { "epoch": 0.93, "grad_norm": 0.012451171875, "learning_rate": 4.358251297791342e-06, "logits/chosen": -1.487817406654358, "logits/rejected": -3.055607795715332, "logps/chosen": -156.3074951171875, "logps/rejected": -315.9574890136719, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 1.7694629430770874, "rewards/margins": 16.351093292236328, "rewards/rejected": -14.579843521118164, "step": 4650 }, { "epoch": 0.94, "grad_norm": 0.8359375, "learning_rate": 4.338666758865291e-06, "logits/chosen": -1.5760504007339478, "logits/rejected": -3.079218864440918, "logps/chosen": -161.78250122070312, "logps/rejected": -324.3299865722656, "loss": 0.0778, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.9159045219421387, "rewards/margins": 16.36734390258789, "rewards/rejected": -15.450937271118164, "step": 4700 }, { "epoch": 0.95, "grad_norm": 0.000255584716796875, "learning_rate": 4.318833293892593e-06, "logits/chosen": -1.5287631750106812, "logits/rejected": -2.8774261474609375, "logps/chosen": -165.83999633789062, "logps/rejected": -315.05499267578125, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 0.16832397878170013, "rewards/margins": 15.089219093322754, "rewards/rejected": -14.9165620803833, "step": 4750 }, { "epoch": 0.96, "grad_norm": 0.82421875, "learning_rate": 4.2987535880069194e-06, "logits/chosen": -1.7194920778274536, "logits/rejected": -3.0331249237060547, "logps/chosen": -173.74000549316406, "logps/rejected": -324.07501220703125, "loss": 0.0864, "rewards/accuracies": 0.875, "rewards/chosen": 0.08797118812799454, "rewards/margins": 15.2670316696167, "rewards/rejected": -15.170312881469727, "step": 4800 }, { "epoch": 0.97, "grad_norm": 0.68359375, "learning_rate": 4.278430359679022e-06, "logits/chosen": -1.6618307828903198, "logits/rejected": -2.7870311737060547, "logps/chosen": -171.0500030517578, "logps/rejected": -332.36248779296875, "loss": 0.0761, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": -0.015126952901482582, "rewards/margins": 16.22640609741211, "rewards/rejected": -16.239219665527344, "step": 4850 }, { "epoch": 0.98, "grad_norm": 0.0081787109375, "learning_rate": 4.2578663603486916e-06, "logits/chosen": -1.4954382181167603, "logits/rejected": -2.641606330871582, "logps/chosen": -165.57749938964844, "logps/rejected": -321.75, "loss": 0.1158, "rewards/accuracies": 0.8324999809265137, "rewards/chosen": -0.45833495259284973, "rewards/margins": 15.786406517028809, "rewards/rejected": -16.246719360351562, "step": 4900 }, { "epoch": 0.99, "grad_norm": 0.009765625, "learning_rate": 4.23706437405226e-06, "logits/chosen": -1.6313989162445068, "logits/rejected": -2.8603711128234863, "logps/chosen": -175.71499633789062, "logps/rejected": -339.1875, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 0.04604126140475273, "rewards/margins": 16.614530563354492, "rewards/rejected": -16.570938110351562, "step": 4950 }, { "epoch": 1.0, "grad_norm": 0.0008697509765625, "learning_rate": 4.21602721704568e-06, "logits/chosen": -1.7376703023910522, "logits/rejected": -2.959348678588867, "logps/chosen": -163.42250061035156, "logps/rejected": -324.82000732421875, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 0.5958447456359863, "rewards/margins": 16.27703094482422, "rewards/rejected": -15.682969093322754, "step": 5000 }, { "epoch": 1.01, "grad_norm": 0.00130462646484375, "learning_rate": 4.194757737423261e-06, "logits/chosen": -1.9893441200256348, "logits/rejected": -3.1228907108306885, "logps/chosen": -161.6875, "logps/rejected": -316.3900146484375, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 0.8289526104927063, "rewards/margins": 15.733905792236328, "rewards/rejected": -14.90640640258789, "step": 5050 }, { "epoch": 1.02, "grad_norm": 0.003387451171875, "learning_rate": 4.1732588147320705e-06, "logits/chosen": -1.9210351705551147, "logits/rejected": -3.0717577934265137, "logps/chosen": -151.19000244140625, "logps/rejected": -294.67999267578125, "loss": 0.1054, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": 0.8471630811691284, "rewards/margins": 14.576250076293945, "rewards/rejected": -13.729218482971191, "step": 5100 }, { "epoch": 1.03, "grad_norm": 0.91015625, "learning_rate": 4.1515333595820975e-06, "logits/chosen": -1.982885718345642, "logits/rejected": -3.0044140815734863, "logps/chosen": -163.9550018310547, "logps/rejected": -320.55999755859375, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 0.7394506931304932, "rewards/margins": 15.829375267028809, "rewards/rejected": -15.094843864440918, "step": 5150 }, { "epoch": 1.04, "grad_norm": 9.393692016601562e-05, "learning_rate": 4.129584313252198e-06, "logits/chosen": -1.7410473823547363, "logits/rejected": -2.6847314834594727, "logps/chosen": -158.25999450683594, "logps/rejected": -317.07000732421875, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 1.0339257717132568, "rewards/margins": 15.897500038146973, "rewards/rejected": -14.865156173706055, "step": 5200 }, { "epoch": 1.05, "grad_norm": 0.8984375, "learning_rate": 4.107414647291893e-06, "logits/chosen": -1.84354829788208, "logits/rejected": -2.650254011154175, "logps/chosen": -160.79750061035156, "logps/rejected": -312.51251220703125, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 1.0474340915679932, "rewards/margins": 15.432499885559082, "rewards/rejected": -14.383749961853027, "step": 5250 }, { "epoch": 1.06, "grad_norm": 0.0002899169921875, "learning_rate": 4.085027363119076e-06, "logits/chosen": -1.7570642232894897, "logits/rejected": -2.6784496307373047, "logps/chosen": -159.75750732421875, "logps/rejected": -307.239990234375, "loss": 0.0985, "rewards/accuracies": 0.8575000166893005, "rewards/chosen": 0.8954675197601318, "rewards/margins": 15.072968482971191, "rewards/rejected": -14.178437232971191, "step": 5300 }, { "epoch": 1.07, "grad_norm": 0.0067138671875, "learning_rate": 4.062425491613656e-06, "logits/chosen": -1.6061410903930664, "logits/rejected": -2.5976123809814453, "logps/chosen": -157.30250549316406, "logps/rejected": -299.4425048828125, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 0.9188379049301147, "rewards/margins": 14.39859390258789, "rewards/rejected": -13.4799222946167, "step": 5350 }, { "epoch": 1.08, "grad_norm": 2.578125, "learning_rate": 4.039612092707236e-06, "logits/chosen": -1.7808789014816284, "logits/rejected": -2.769533634185791, "logps/chosen": -161.10499572753906, "logps/rejected": -301.5849914550781, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 0.6963549852371216, "rewards/margins": 14.181875228881836, "rewards/rejected": -13.483905792236328, "step": 5400 }, { "epoch": 1.09, "grad_norm": 1.0, "learning_rate": 4.016590254968842e-06, "logits/chosen": -1.786810278892517, "logits/rejected": -2.6385669708251953, "logps/chosen": -168.65750122070312, "logps/rejected": -320.4674987792969, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 0.3265624940395355, "rewards/margins": 15.369843482971191, "rewards/rejected": -15.044530868530273, "step": 5450 }, { "epoch": 1.1, "grad_norm": 0.000202178955078125, "learning_rate": 3.993363095186781e-06, "logits/chosen": -1.775051236152649, "logits/rejected": -2.604965925216675, "logps/chosen": -175.4250030517578, "logps/rejected": -334.4700012207031, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": -0.4224267601966858, "rewards/margins": 16.112499237060547, "rewards/rejected": -16.530624389648438, "step": 5500 }, { "epoch": 1.11, "grad_norm": 0.0089111328125, "learning_rate": 3.9699337579466765e-06, "logits/chosen": -1.4332165718078613, "logits/rejected": -2.4917151927948, "logps/chosen": -174.8524932861328, "logps/rejected": -330.2449951171875, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": -0.360107421875, "rewards/margins": 15.817968368530273, "rewards/rejected": -16.182968139648438, "step": 5550 }, { "epoch": 1.12, "grad_norm": 0.000827789306640625, "learning_rate": 3.946305415205748e-06, "logits/chosen": -1.3821977376937866, "logits/rejected": -2.346054792404175, "logps/chosen": -173.88250732421875, "logps/rejected": -325.8175048828125, "loss": 0.1003, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": -0.9856836199760437, "rewards/margins": 15.3203125, "rewards/rejected": -16.30843734741211, "step": 5600 }, { "epoch": 1.13, "grad_norm": 1.171875, "learning_rate": 3.922481265863371e-06, "logits/chosen": -1.5632950067520142, "logits/rejected": -2.638700008392334, "logps/chosen": -176.65750122070312, "logps/rejected": -339.3949890136719, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": -0.8098046779632568, "rewards/margins": 16.346094131469727, "rewards/rejected": -17.149063110351562, "step": 5650 }, { "epoch": 1.1400000000000001, "grad_norm": 0.8984375, "learning_rate": 3.898464535327997e-06, "logits/chosen": -1.559891700744629, "logits/rejected": -2.6719970703125, "logps/chosen": -175.10499572753906, "logps/rejected": -332.5799865722656, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": -0.6723046898841858, "rewards/margins": 16.07062530517578, "rewards/rejected": -16.74625015258789, "step": 5700 }, { "epoch": 1.15, "grad_norm": 1.52587890625e-05, "learning_rate": 3.874258475080497e-06, "logits/chosen": -1.848596215248108, "logits/rejected": -2.9883642196655273, "logps/chosen": -171.56500244140625, "logps/rejected": -336.0675048828125, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": -0.05261474475264549, "rewards/margins": 16.615468978881836, "rewards/rejected": -16.658750534057617, "step": 5750 }, { "epoch": 1.16, "grad_norm": 0.0010528564453125, "learning_rate": 3.849866362233947e-06, "logits/chosen": -1.624019742012024, "logits/rejected": -2.9648828506469727, "logps/chosen": -168.875, "logps/rejected": -331.17999267578125, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 0.45997437834739685, "rewards/margins": 16.332500457763672, "rewards/rejected": -15.879219055175781, "step": 5800 }, { "epoch": 1.17, "grad_norm": 0.001678466796875, "learning_rate": 3.8252914990899695e-06, "logits/chosen": -1.573280692100525, "logits/rejected": -2.7242345809936523, "logps/chosen": -169.05499267578125, "logps/rejected": -322.07501220703125, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": -0.01105346716940403, "rewards/margins": 15.53531265258789, "rewards/rejected": -15.5521879196167, "step": 5850 }, { "epoch": 1.18, "grad_norm": 0.01416015625, "learning_rate": 3.800537212691651e-06, "logits/chosen": -1.745273470878601, "logits/rejected": -2.6969921588897705, "logps/chosen": -178.67250061035156, "logps/rejected": -330.9800109863281, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": -0.30939698219299316, "rewards/margins": 15.374062538146973, "rewards/rejected": -15.685625076293945, "step": 5900 }, { "epoch": 1.19, "grad_norm": 0.000164031982421875, "learning_rate": 3.775606854373115e-06, "logits/chosen": -1.655666470527649, "logits/rejected": -2.6190404891967773, "logps/chosen": -174.1925048828125, "logps/rejected": -326.572509765625, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 0.033031005412340164, "rewards/margins": 15.436562538146973, "rewards/rejected": -15.404999732971191, "step": 5950 }, { "epoch": 1.2, "grad_norm": 0.546875, "learning_rate": 3.7505037993058046e-06, "logits/chosen": -1.7215648889541626, "logits/rejected": -2.621614933013916, "logps/chosen": -175.375, "logps/rejected": -334.135009765625, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": -0.3496679663658142, "rewards/margins": 16.133594512939453, "rewards/rejected": -16.4854679107666, "step": 6000 }, { "epoch": 1.21, "grad_norm": 1.0859375, "learning_rate": 3.7252314460415396e-06, "logits/chosen": -1.5741479396820068, "logits/rejected": -2.679941415786743, "logps/chosen": -172.75250244140625, "logps/rejected": -326.0625, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 0.0013964843237772584, "rewards/margins": 15.612030982971191, "rewards/rejected": -15.608750343322754, "step": 6050 }, { "epoch": 1.22, "grad_norm": 2.3245811462402344e-05, "learning_rate": 3.6997932160524018e-06, "logits/chosen": -1.6118944883346558, "logits/rejected": -2.7812891006469727, "logps/chosen": -174.97500610351562, "logps/rejected": -325.45001220703125, "loss": 0.0778, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.1942773461341858, "rewards/margins": 15.537187576293945, "rewards/rejected": -15.733750343322754, "step": 6100 }, { "epoch": 1.23, "grad_norm": 6.961822509765625e-05, "learning_rate": 3.6741925532675297e-06, "logits/chosen": -1.480682373046875, "logits/rejected": -2.7474658489227295, "logps/chosen": -165.65750122070312, "logps/rejected": -317.760009765625, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 0.29445311427116394, "rewards/margins": 15.585000038146973, "rewards/rejected": -15.289531707763672, "step": 6150 }, { "epoch": 1.24, "grad_norm": 0.0042724609375, "learning_rate": 3.648432923606862e-06, "logits/chosen": -1.3990026712417603, "logits/rejected": -2.915020704269409, "logps/chosen": -164.90750122070312, "logps/rejected": -323.4700012207031, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 0.8015918135643005, "rewards/margins": 16.133438110351562, "rewards/rejected": -15.328437805175781, "step": 6200 }, { "epoch": 1.25, "grad_norm": 0.00024127960205078125, "learning_rate": 3.622517814511906e-06, "logits/chosen": -1.320135474205017, "logits/rejected": -2.860288143157959, "logps/chosen": -163.79750061035156, "logps/rejected": -321.0450134277344, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 0.6940441727638245, "rewards/margins": 16.037656784057617, "rewards/rejected": -15.3423433303833, "step": 6250 }, { "epoch": 1.26, "grad_norm": 0.98828125, "learning_rate": 3.5964507344735965e-06, "logits/chosen": -1.176222324371338, "logits/rejected": -2.6737987995147705, "logps/chosen": -160.44500732421875, "logps/rejected": -313.2349853515625, "loss": 0.1003, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": 0.5672607421875, "rewards/margins": 15.430312156677246, "rewards/rejected": -14.865625381469727, "step": 6300 }, { "epoch": 1.27, "grad_norm": 0.000461578369140625, "learning_rate": 3.5702352125573015e-06, "logits/chosen": -1.2549536228179932, "logits/rejected": -2.675966739654541, "logps/chosen": -159.0675048828125, "logps/rejected": -311.6025085449219, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 0.6797509789466858, "rewards/margins": 15.3959379196167, "rewards/rejected": -14.717031478881836, "step": 6350 }, { "epoch": 1.28, "grad_norm": 0.0001392364501953125, "learning_rate": 3.543874797925042e-06, "logits/chosen": -1.5146979093551636, "logits/rejected": -2.7764551639556885, "logps/chosen": -170.92750549316406, "logps/rejected": -323.7449951171875, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 0.3626416027545929, "rewards/margins": 15.621562957763672, "rewards/rejected": -15.259843826293945, "step": 6400 }, { "epoch": 1.29, "grad_norm": 0.0126953125, "learning_rate": 3.5173730593549947e-06, "logits/chosen": -1.8136059045791626, "logits/rejected": -2.790639638900757, "logps/chosen": -169.0, "logps/rejected": -316.1000061035156, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 0.23876464366912842, "rewards/margins": 14.961406707763672, "rewards/rejected": -14.722187042236328, "step": 6450 }, { "epoch": 1.3, "grad_norm": 0.00799560546875, "learning_rate": 3.4907335847583356e-06, "logits/chosen": -1.9238842725753784, "logits/rejected": -2.8575196266174316, "logps/chosen": -170.00999450683594, "logps/rejected": -319.3324890136719, "loss": 0.1003, "rewards/accuracies": 0.8550000190734863, "rewards/chosen": -0.1691601574420929, "rewards/margins": 15.154375076293945, "rewards/rejected": -15.327031135559082, "step": 6500 }, { "epoch": 1.31, "grad_norm": 0.0074462890625, "learning_rate": 3.463959980693492e-06, "logits/chosen": -1.840087890625, "logits/rejected": -2.790390729904175, "logps/chosen": -169.75750732421875, "logps/rejected": -326.3450012207031, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 0.2927929759025574, "rewards/margins": 15.926562309265137, "rewards/rejected": -15.626562118530273, "step": 6550 }, { "epoch": 1.32, "grad_norm": 6.341934204101562e-05, "learning_rate": 3.4370558718778753e-06, "logits/chosen": -1.810239315032959, "logits/rejected": -2.8067736625671387, "logps/chosen": -168.5050048828125, "logps/rejected": -325.92498779296875, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 0.42659667134284973, "rewards/margins": 16.05078125, "rewards/rejected": -15.6264066696167, "step": 6600 }, { "epoch": 1.33, "grad_norm": 0.0002422332763671875, "learning_rate": 3.4100249006971514e-06, "logits/chosen": -2.036135196685791, "logits/rejected": -2.8965821266174316, "logps/chosen": -168.71499633789062, "logps/rejected": -319.447509765625, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 0.32066404819488525, "rewards/margins": 15.444375038146973, "rewards/rejected": -15.124062538146973, "step": 6650 }, { "epoch": 1.34, "grad_norm": 0.00063323974609375, "learning_rate": 3.3828707267121185e-06, "logits/chosen": -1.9675488471984863, "logits/rejected": -2.8052685260772705, "logps/chosen": -165.78250122070312, "logps/rejected": -315.56500244140625, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 0.36207762360572815, "rewards/margins": 15.29671859741211, "rewards/rejected": -14.934218406677246, "step": 6700 }, { "epoch": 1.35, "grad_norm": 1.15625, "learning_rate": 3.355597026163264e-06, "logits/chosen": -1.784468412399292, "logits/rejected": -2.6825194358825684, "logps/chosen": -162.88250732421875, "logps/rejected": -311.5199890136719, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.07732421904802322, "rewards/margins": 15.029531478881836, "rewards/rejected": -14.949531555175781, "step": 6750 }, { "epoch": 1.3599999999999999, "grad_norm": 0.8828125, "learning_rate": 3.3282074914730577e-06, "logits/chosen": -1.9187493324279785, "logits/rejected": -2.836562395095825, "logps/chosen": -158.41000366210938, "logps/rejected": -310.2674865722656, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 1.508481502532959, "rewards/margins": 15.624530792236328, "rewards/rejected": -14.117656707763672, "step": 6800 }, { "epoch": 1.37, "grad_norm": 0.0068359375, "learning_rate": 3.300705830746057e-06, "logits/chosen": -1.7984619140625, "logits/rejected": -2.8072216510772705, "logps/chosen": -147.9375, "logps/rejected": -300.1875, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 2.547187566757202, "rewards/margins": 15.463281631469727, "rewards/rejected": -12.914375305175781, "step": 6850 }, { "epoch": 1.38, "grad_norm": 0.01104736328125, "learning_rate": 3.2730957672668917e-06, "logits/chosen": -1.6440536975860596, "logits/rejected": -2.777017593383789, "logps/chosen": -143.3625030517578, "logps/rejected": -295.2699890136719, "loss": 0.0726, "rewards/accuracies": 0.8949999809265137, "rewards/chosen": 2.895751953125, "rewards/margins": 15.431718826293945, "rewards/rejected": -12.537031173706055, "step": 6900 }, { "epoch": 1.3900000000000001, "grad_norm": 0.0013580322265625, "learning_rate": 3.245381038996188e-06, "logits/chosen": -1.747806429862976, "logits/rejected": -2.857853889465332, "logps/chosen": -139.8975067138672, "logps/rejected": -288.42999267578125, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 3.0215137004852295, "rewards/margins": 15.106406211853027, "rewards/rejected": -12.087187767028809, "step": 6950 }, { "epoch": 1.4, "grad_norm": 0.0035247802734375, "learning_rate": 3.2175653980645096e-06, "logits/chosen": -1.6131787300109863, "logits/rejected": -2.7717015743255615, "logps/chosen": -134.77499389648438, "logps/rejected": -281.7250061035156, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 3.087773323059082, "rewards/margins": 14.971718788146973, "rewards/rejected": -11.884687423706055, "step": 7000 }, { "epoch": 1.41, "grad_norm": 0.003173828125, "learning_rate": 3.189652610264379e-06, "logits/chosen": -1.6364532709121704, "logits/rejected": -2.7611522674560547, "logps/chosen": -148.1074981689453, "logps/rejected": -292.260009765625, "loss": 0.0985, "rewards/accuracies": 0.8575000166893005, "rewards/chosen": 2.152085065841675, "rewards/margins": 14.622968673706055, "rewards/rejected": -12.472969055175781, "step": 7050 }, { "epoch": 1.42, "grad_norm": 0.00909423828125, "learning_rate": 3.1616464545404486e-06, "logits/chosen": -1.6323034763336182, "logits/rejected": -2.8021483421325684, "logps/chosen": -152.21749877929688, "logps/rejected": -307.4024963378906, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 2.286259651184082, "rewards/margins": 15.906719207763672, "rewards/rejected": -13.616406440734863, "step": 7100 }, { "epoch": 1.43, "grad_norm": 0.00014591217041015625, "learning_rate": 3.133550722477896e-06, "logits/chosen": -1.5869457721710205, "logits/rejected": -2.831690788269043, "logps/chosen": -151.6699981689453, "logps/rejected": -312.9800109863281, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 2.021599054336548, "rewards/margins": 16.44562530517578, "rewards/rejected": -14.4232816696167, "step": 7150 }, { "epoch": 1.44, "grad_norm": 0.0057373046875, "learning_rate": 3.105369217789099e-06, "logits/chosen": -1.5832836627960205, "logits/rejected": -2.826430559158325, "logps/chosen": -147.8800048828125, "logps/rejected": -299.2049865722656, "loss": 0.102, "rewards/accuracies": 0.8525000214576721, "rewards/chosen": 1.6627343893051147, "rewards/margins": 15.454687118530273, "rewards/rejected": -13.7876558303833, "step": 7200 }, { "epoch": 1.45, "grad_norm": 1.3359375, "learning_rate": 3.077105755798675e-06, "logits/chosen": -1.5947095155715942, "logits/rejected": -2.803586483001709, "logps/chosen": -149.30250549316406, "logps/rejected": -300.635009765625, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.6690233945846558, "rewards/margins": 15.402812957763672, "rewards/rejected": -13.731562614440918, "step": 7250 }, { "epoch": 1.46, "grad_norm": 0.0003814697265625, "learning_rate": 3.0487641629269515e-06, "logits/chosen": -1.3740723133087158, "logits/rejected": -2.788759708404541, "logps/chosen": -149.9949951171875, "logps/rejected": -307.3275146484375, "loss": 0.0864, "rewards/accuracies": 0.875, "rewards/chosen": 1.7442187070846558, "rewards/margins": 15.862968444824219, "rewards/rejected": -14.121718406677246, "step": 7300 }, { "epoch": 1.47, "grad_norm": 0.6015625, "learning_rate": 3.0203482761719226e-06, "logits/chosen": -1.3477171659469604, "logits/rejected": -2.810493230819702, "logps/chosen": -152.57249450683594, "logps/rejected": -303.9750061035156, "loss": 0.1054, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": 1.5104199647903442, "rewards/margins": 15.333281517028809, "rewards/rejected": -13.822187423706055, "step": 7350 }, { "epoch": 1.48, "grad_norm": 0.59375, "learning_rate": 2.991861942589788e-06, "logits/chosen": -1.4101388454437256, "logits/rejected": -2.9320101737976074, "logps/chosen": -151.7100067138672, "logps/rejected": -306.79998779296875, "loss": 0.0761, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": 2.018501043319702, "rewards/margins": 15.747187614440918, "rewards/rejected": -13.726093292236328, "step": 7400 }, { "epoch": 1.49, "grad_norm": 4.76837158203125e-05, "learning_rate": 2.9633090187741186e-06, "logits/chosen": -1.3069872856140137, "logits/rejected": -2.8863134384155273, "logps/chosen": -145.5574951171875, "logps/rejected": -290.4125061035156, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.1462109088897705, "rewards/margins": 14.840781211853027, "rewards/rejected": -12.689844131469727, "step": 7450 }, { "epoch": 1.5, "grad_norm": 0.0001659393310546875, "learning_rate": 2.934693370333739e-06, "logits/chosen": -1.3929543495178223, "logits/rejected": -2.97314453125, "logps/chosen": -147.5225067138672, "logps/rejected": -305.05999755859375, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 2.6558544635772705, "rewards/margins": 15.836562156677246, "rewards/rejected": -13.185937881469727, "step": 7500 }, { "epoch": 1.51, "grad_norm": 0.98828125, "learning_rate": 2.9060188713693794e-06, "logits/chosen": -1.3370373249053955, "logits/rejected": -2.9380431175231934, "logps/chosen": -141.3574981689453, "logps/rejected": -294.44500732421875, "loss": 0.0761, "rewards/accuracies": 0.8899999856948853, "rewards/chosen": 2.661679744720459, "rewards/margins": 15.43765640258789, "rewards/rejected": -12.771562576293945, "step": 7550 }, { "epoch": 1.52, "grad_norm": 0.78515625, "learning_rate": 2.8772894039491938e-06, "logits/chosen": -1.4408868551254272, "logits/rejected": -2.962076425552368, "logps/chosen": -142.13999938964844, "logps/rejected": -295.55999755859375, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.679374933242798, "rewards/margins": 15.471875190734863, "rewards/rejected": -12.795391082763672, "step": 7600 }, { "epoch": 1.53, "grad_norm": 0.01397705078125, "learning_rate": 2.848508857583183e-06, "logits/chosen": -1.521622896194458, "logits/rejected": -3.0157811641693115, "logps/chosen": -142.84500122070312, "logps/rejected": -294.93499755859375, "loss": 0.0864, "rewards/accuracies": 0.875, "rewards/chosen": 2.692207098007202, "rewards/margins": 15.400468826293945, "rewards/rejected": -12.703906059265137, "step": 7650 }, { "epoch": 1.54, "grad_norm": 0.0037689208984375, "learning_rate": 2.81968112869662e-06, "logits/chosen": -1.334303379058838, "logits/rejected": -2.9068212509155273, "logps/chosen": -137.8925018310547, "logps/rejected": -280.385009765625, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 2.869335889816284, "rewards/margins": 14.509531021118164, "rewards/rejected": -11.638280868530273, "step": 7700 }, { "epoch": 1.55, "grad_norm": 1.1015625, "learning_rate": 2.790810120102534e-06, "logits/chosen": -1.453762173652649, "logits/rejected": -2.9707226753234863, "logps/chosen": -146.41000366210938, "logps/rejected": -292.67498779296875, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 2.3908984661102295, "rewards/margins": 14.83578109741211, "rewards/rejected": -12.446874618530273, "step": 7750 }, { "epoch": 1.56, "grad_norm": 0.00040435791015625, "learning_rate": 2.7618997404733365e-06, "logits/chosen": -1.6041711568832397, "logits/rejected": -2.9843311309814453, "logps/chosen": -150.46749877929688, "logps/rejected": -298.6700134277344, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.042372941970825, "rewards/margins": 15.013437271118164, "rewards/rejected": -12.966405868530273, "step": 7800 }, { "epoch": 1.5699999999999998, "grad_norm": 0.0004024505615234375, "learning_rate": 2.7329539038116453e-06, "logits/chosen": -1.4521881341934204, "logits/rejected": -2.926464796066284, "logps/chosen": -150.1074981689453, "logps/rejected": -298.0425109863281, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 2.3667969703674316, "rewards/margins": 14.889687538146973, "rewards/rejected": -12.527030944824219, "step": 7850 }, { "epoch": 1.58, "grad_norm": 0.8671875, "learning_rate": 2.7039765289203947e-06, "logits/chosen": -1.412445068359375, "logits/rejected": -2.8667969703674316, "logps/chosen": -146.1300048828125, "logps/rejected": -291.2850036621094, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 2.3307225704193115, "rewards/margins": 14.735937118530273, "rewards/rejected": -12.403437614440918, "step": 7900 }, { "epoch": 1.5899999999999999, "grad_norm": 0.67578125, "learning_rate": 2.6749715388722865e-06, "logits/chosen": -1.2646269798278809, "logits/rejected": -2.821697473526001, "logps/chosen": -145.84750366210938, "logps/rejected": -289.67498779296875, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 2.281503915786743, "rewards/margins": 14.5806245803833, "rewards/rejected": -12.300156593322754, "step": 7950 }, { "epoch": 1.6, "grad_norm": 0.0015411376953125, "learning_rate": 2.6459428604786757e-06, "logits/chosen": -1.311349630355835, "logits/rejected": -2.7577929496765137, "logps/chosen": -150.4425048828125, "logps/rejected": -293.7149963378906, "loss": 0.0985, "rewards/accuracies": 0.8575000166893005, "rewards/chosen": 1.655849575996399, "rewards/margins": 14.475312232971191, "rewards/rejected": -12.818437576293945, "step": 8000 }, { "epoch": 1.6099999999999999, "grad_norm": 0.00147247314453125, "learning_rate": 2.616894423757941e-06, "logits/chosen": -1.4159927368164062, "logits/rejected": -2.9021289348602295, "logps/chosen": -151.3925018310547, "logps/rejected": -296.8399963378906, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 2.0074121952056885, "rewards/margins": 14.760781288146973, "rewards/rejected": -12.754219055175781, "step": 8050 }, { "epoch": 1.62, "grad_norm": 0.000164031982421875, "learning_rate": 2.587830161403419e-06, "logits/chosen": -1.3917722702026367, "logits/rejected": -2.887305974960327, "logps/chosen": -152.03500366210938, "logps/rejected": -288.260009765625, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 1.9093506336212158, "rewards/margins": 13.984375, "rewards/rejected": -12.0795316696167, "step": 8100 }, { "epoch": 1.63, "grad_norm": 0.90625, "learning_rate": 2.5587540082509864e-06, "logits/chosen": -1.3086663484573364, "logits/rejected": -2.910728693008423, "logps/chosen": -148.97250366210938, "logps/rejected": -287.9200134277344, "loss": 0.0795, "rewards/accuracies": 0.8849999904632568, "rewards/chosen": 2.129091739654541, "rewards/margins": 14.16812515258789, "rewards/rejected": -12.041093826293945, "step": 8150 }, { "epoch": 1.6400000000000001, "grad_norm": 0.0361328125, "learning_rate": 2.5296699007463434e-06, "logits/chosen": -1.198205590248108, "logits/rejected": -2.7841992378234863, "logps/chosen": -147.5749969482422, "logps/rejected": -290.5050048828125, "loss": 0.0657, "rewards/accuracies": 0.9049999713897705, "rewards/chosen": 2.2004687786102295, "rewards/margins": 14.422344207763672, "rewards/rejected": -12.222969055175781, "step": 8200 }, { "epoch": 1.65, "grad_norm": 0.0004787445068359375, "learning_rate": 2.500581776412081e-06, "logits/chosen": -1.3178759813308716, "logits/rejected": -2.763364315032959, "logps/chosen": -153.17999267578125, "logps/rejected": -296.3699951171875, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 1.7891894578933716, "rewards/margins": 14.428750038146973, "rewards/rejected": -12.639843940734863, "step": 8250 }, { "epoch": 1.6600000000000001, "grad_norm": 0.7578125, "learning_rate": 2.471493573314605e-06, "logits/chosen": -1.273976445198059, "logits/rejected": -2.638751745223999, "logps/chosen": -147.61749267578125, "logps/rejected": -281.2300109863281, "loss": 0.1089, "rewards/accuracies": 0.8424999713897705, "rewards/chosen": 1.724277377128601, "rewards/margins": 13.661250114440918, "rewards/rejected": -11.94156265258789, "step": 8300 }, { "epoch": 1.67, "grad_norm": 0.5234375, "learning_rate": 2.442409229530985e-06, "logits/chosen": -1.3456776142120361, "logits/rejected": -2.6645333766937256, "logps/chosen": -148.5449981689453, "logps/rejected": -290.0400085449219, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 1.7859375476837158, "rewards/margins": 14.419530868530273, "rewards/rejected": -12.630781173706055, "step": 8350 }, { "epoch": 1.6800000000000002, "grad_norm": 1.046875, "learning_rate": 2.4133326826158006e-06, "logits/chosen": -1.452473759651184, "logits/rejected": -2.719834089279175, "logps/chosen": -151.13999938964844, "logps/rejected": -297.2650146484375, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 1.8502148389816284, "rewards/margins": 14.8579683303833, "rewards/rejected": -13.008437156677246, "step": 8400 }, { "epoch": 1.69, "grad_norm": 0.0001697540283203125, "learning_rate": 2.3842678690680612e-06, "logits/chosen": -1.2888891696929932, "logits/rejected": -2.6560449600219727, "logps/chosen": -150.96499633789062, "logps/rejected": -290.1199951171875, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 1.7348560094833374, "rewards/margins": 14.02734375, "rewards/rejected": -12.290937423706055, "step": 8450 }, { "epoch": 1.7, "grad_norm": 0.00016498565673828125, "learning_rate": 2.355218723798264e-06, "logits/chosen": -1.3379980325698853, "logits/rejected": -2.7865869998931885, "logps/chosen": -153.50999450683594, "logps/rejected": -295.2149963378906, "loss": 0.0778, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 2.0467870235443115, "rewards/margins": 14.391094207763672, "rewards/rejected": -12.346562385559082, "step": 8500 }, { "epoch": 1.71, "grad_norm": 1.0, "learning_rate": 2.326189179595676e-06, "logits/chosen": -1.2621526718139648, "logits/rejected": -2.6848363876342773, "logps/chosen": -151.13999938964844, "logps/rejected": -294.8949890136719, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 2.0117383003234863, "rewards/margins": 14.526562690734863, "rewards/rejected": -12.5053129196167, "step": 8550 }, { "epoch": 1.72, "grad_norm": 0.0203857421875, "learning_rate": 2.297183166595889e-06, "logits/chosen": -1.3091638088226318, "logits/rejected": -2.7887303829193115, "logps/chosen": -155.8625030517578, "logps/rejected": -300.9750061035156, "loss": 0.0588, "rewards/accuracies": 0.9150000214576721, "rewards/chosen": 2.133901357650757, "rewards/margins": 14.717499732971191, "rewards/rejected": -12.588281631469727, "step": 8600 }, { "epoch": 1.73, "grad_norm": 1.25, "learning_rate": 2.26820461174875e-06, "logits/chosen": -1.2939709424972534, "logits/rejected": -2.6905393600463867, "logps/chosen": -148.60499572753906, "logps/rejected": -286.3699951171875, "loss": 0.0933, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 1.790956974029541, "rewards/margins": 13.9829683303833, "rewards/rejected": -12.193750381469727, "step": 8650 }, { "epoch": 1.74, "grad_norm": 0.93359375, "learning_rate": 2.23925743828671e-06, "logits/chosen": -1.1340380907058716, "logits/rejected": -2.585568904876709, "logps/chosen": -146.1999969482422, "logps/rejected": -279.0899963378906, "loss": 0.1037, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 1.663818359375, "rewards/margins": 13.498906135559082, "rewards/rejected": -11.834218978881836, "step": 8700 }, { "epoch": 1.75, "grad_norm": 0.00543212890625, "learning_rate": 2.2103455651936824e-06, "logits/chosen": -1.1339178085327148, "logits/rejected": -2.6299610137939453, "logps/chosen": -146.9774932861328, "logps/rejected": -285.3175048828125, "loss": 0.0968, "rewards/accuracies": 0.8600000143051147, "rewards/chosen": 1.9546289443969727, "rewards/margins": 14.03499984741211, "rewards/rejected": -12.078594207763672, "step": 8750 }, { "epoch": 1.76, "grad_norm": 0.005828857421875, "learning_rate": 2.181472906674478e-06, "logits/chosen": -1.313624382019043, "logits/rejected": -2.7783007621765137, "logps/chosen": -154.29750061035156, "logps/rejected": -303.6675109863281, "loss": 0.0553, "rewards/accuracies": 0.9200000166893005, "rewards/chosen": 2.0906200408935547, "rewards/margins": 15.23062515258789, "rewards/rejected": -13.136249542236328, "step": 8800 }, { "epoch": 1.77, "grad_norm": 0.00531005859375, "learning_rate": 2.152643371624878e-06, "logits/chosen": -1.257965087890625, "logits/rejected": -2.8671875, "logps/chosen": -148.28500366210938, "logps/rejected": -293.7850036621094, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 2.257265567779541, "rewards/margins": 14.864843368530273, "rewards/rejected": -12.609844207763672, "step": 8850 }, { "epoch": 1.78, "grad_norm": 0.005889892578125, "learning_rate": 2.1238608631024416e-06, "logits/chosen": -1.217246651649475, "logits/rejected": -2.843193292617798, "logps/chosen": -151.90249633789062, "logps/rejected": -294.79998779296875, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 1.9859668016433716, "rewards/margins": 14.599687576293945, "rewards/rejected": -12.60953140258789, "step": 8900 }, { "epoch": 1.79, "grad_norm": 0.79296875, "learning_rate": 2.095129277798084e-06, "logits/chosen": -1.2338311672210693, "logits/rejected": -2.8333849906921387, "logps/chosen": -149.3524932861328, "logps/rejected": -293.375, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.1244139671325684, "rewards/margins": 14.682812690734863, "rewards/rejected": -12.559218406677246, "step": 8950 }, { "epoch": 1.8, "grad_norm": 0.00011968612670898438, "learning_rate": 2.0664525055085353e-06, "logits/chosen": -1.25933837890625, "logits/rejected": -2.8884618282318115, "logps/chosen": -149.42750549316406, "logps/rejected": -296.8999938964844, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 2.1980907917022705, "rewards/margins": 14.94124984741211, "rewards/rejected": -12.743906021118164, "step": 9000 }, { "epoch": 1.81, "grad_norm": 0.005859375, "learning_rate": 2.037834428609718e-06, "logits/chosen": -1.220767855644226, "logits/rejected": -2.8833789825439453, "logps/chosen": -150.5800018310547, "logps/rejected": -299.7900085449219, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 2.273066520690918, "rewards/margins": 15.19124984741211, "rewards/rejected": -12.915781021118164, "step": 9050 }, { "epoch": 1.8199999999999998, "grad_norm": 1.265625, "learning_rate": 2.009278921531141e-06, "logits/chosen": -1.1782631874084473, "logits/rejected": -2.8433496952056885, "logps/chosen": -148.50250244140625, "logps/rejected": -299.1025085449219, "loss": 0.0743, "rewards/accuracies": 0.8924999833106995, "rewards/chosen": 2.413759708404541, "rewards/margins": 15.21875, "rewards/rejected": -12.807656288146973, "step": 9100 }, { "epoch": 1.83, "grad_norm": 0.0002918243408203125, "learning_rate": 1.9807898502313577e-06, "logits/chosen": -1.0487598180770874, "logits/rejected": -2.6698436737060547, "logps/chosen": -143.33250427246094, "logps/rejected": -277.25, "loss": 0.1227, "rewards/accuracies": 0.8224999904632568, "rewards/chosen": 1.6386914253234863, "rewards/margins": 13.599374771118164, "rewards/rejected": -11.961718559265137, "step": 9150 }, { "epoch": 1.8399999999999999, "grad_norm": 0.00640869140625, "learning_rate": 1.9523710716745846e-06, "logits/chosen": -1.1239213943481445, "logits/rejected": -2.814990282058716, "logps/chosen": -145.6374969482422, "logps/rejected": -289.4549865722656, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 2.229843854904175, "rewards/margins": 14.695937156677246, "rewards/rejected": -12.4609375, "step": 9200 }, { "epoch": 1.85, "grad_norm": 1.1328125, "learning_rate": 1.9240264333085247e-06, "logits/chosen": -1.1269491910934448, "logits/rejected": -2.812253475189209, "logps/chosen": -142.22000122070312, "logps/rejected": -289.4075012207031, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.428847551345825, "rewards/margins": 14.7670316696167, "rewards/rejected": -12.332500457763672, "step": 9250 }, { "epoch": 1.8599999999999999, "grad_norm": 0.0002460479736328125, "learning_rate": 1.8957597725434814e-06, "logits/chosen": -1.052548885345459, "logits/rejected": -2.777416944503784, "logps/chosen": -144.0449981689453, "logps/rejected": -290.26251220703125, "loss": 0.083, "rewards/accuracies": 0.8799999952316284, "rewards/chosen": 2.608515739440918, "rewards/margins": 14.833125114440918, "rewards/rejected": -12.229687690734863, "step": 9300 }, { "epoch": 1.87, "grad_norm": 0.000820159912109375, "learning_rate": 1.8675749162328472e-06, "logits/chosen": -0.9933964610099792, "logits/rejected": -2.699892520904541, "logps/chosen": -144.44000244140625, "logps/rejected": -286.8374938964844, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 2.7115039825439453, "rewards/margins": 14.4439058303833, "rewards/rejected": -11.732656478881836, "step": 9350 }, { "epoch": 1.88, "grad_norm": 1.1484375, "learning_rate": 1.839475680154994e-06, "logits/chosen": -0.9599626064300537, "logits/rejected": -2.6507763862609863, "logps/chosen": -139.96249389648438, "logps/rejected": -276.3900146484375, "loss": 0.1106, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": 2.6600000858306885, "rewards/margins": 13.848750114440918, "rewards/rejected": -11.1899995803833, "step": 9400 }, { "epoch": 1.8900000000000001, "grad_norm": 0.0001621246337890625, "learning_rate": 1.8114658684966893e-06, "logits/chosen": -0.9667956829071045, "logits/rejected": -2.65411376953125, "logps/chosen": -140.2100067138672, "logps/rejected": -285.072509765625, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 2.996386766433716, "rewards/margins": 14.584375381469727, "rewards/rejected": -11.588281631469727, "step": 9450 }, { "epoch": 1.9, "grad_norm": 0.00021076202392578125, "learning_rate": 1.7835492733380621e-06, "logits/chosen": -0.9071944952011108, "logits/rejected": -2.6002306938171387, "logps/chosen": -140.1374969482422, "logps/rejected": -280.19000244140625, "loss": 0.0951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.7462353706359863, "rewards/margins": 14.235312461853027, "rewards/rejected": -11.487500190734863, "step": 9500 }, { "epoch": 1.9100000000000001, "grad_norm": 8.487701416015625e-05, "learning_rate": 1.755729674139224e-06, "logits/chosen": -0.9953001141548157, "logits/rejected": -2.6695971488952637, "logps/chosen": -138.27499389648438, "logps/rejected": -274.2049865722656, "loss": 0.1106, "rewards/accuracies": 0.8399999737739563, "rewards/chosen": 2.5620312690734863, "rewards/margins": 13.859375, "rewards/rejected": -11.292655944824219, "step": 9550 }, { "epoch": 1.92, "grad_norm": 0.578125, "learning_rate": 1.7280108372285804e-06, "logits/chosen": -1.1185680627822876, "logits/rejected": -2.6991472244262695, "logps/chosen": -143.2100067138672, "logps/rejected": -288.0799865722656, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.851914167404175, "rewards/margins": 14.767969131469727, "rewards/rejected": -11.918749809265137, "step": 9600 }, { "epoch": 1.9300000000000002, "grad_norm": 1.53125, "learning_rate": 1.700396515292942e-06, "logits/chosen": -1.051497220993042, "logits/rejected": -2.6542186737060547, "logps/chosen": -140.2550048828125, "logps/rejected": -285.32000732421875, "loss": 0.0933, "rewards/accuracies": 0.8650000095367432, "rewards/chosen": 2.637402296066284, "rewards/margins": 14.5482816696167, "rewards/rejected": -11.911328315734863, "step": 9650 }, { "epoch": 1.94, "grad_norm": 0.0093994140625, "learning_rate": 1.67289044686946e-06, "logits/chosen": -1.104941725730896, "logits/rejected": -2.7016260623931885, "logps/chosen": -141.42750549316406, "logps/rejected": -286.2049865722656, "loss": 0.0812, "rewards/accuracies": 0.8824999928474426, "rewards/chosen": 2.8697266578674316, "rewards/margins": 14.7017183303833, "rewards/rejected": -11.830156326293945, "step": 9700 }, { "epoch": 1.95, "grad_norm": 0.0042724609375, "learning_rate": 1.6454963558394954e-06, "logits/chosen": -1.0008597373962402, "logits/rejected": -2.603630304336548, "logps/chosen": -137.0625, "logps/rejected": -274.5350036621094, "loss": 0.0916, "rewards/accuracies": 0.8675000071525574, "rewards/chosen": 2.792910099029541, "rewards/margins": 13.903437614440918, "rewards/rejected": -11.112500190734863, "step": 9750 }, { "epoch": 1.96, "grad_norm": 0.00189971923828125, "learning_rate": 1.6182179509244623e-06, "logits/chosen": -0.8819994926452637, "logits/rejected": -2.5757317543029785, "logps/chosen": -136.65750122070312, "logps/rejected": -267.739990234375, "loss": 0.1054, "rewards/accuracies": 0.8475000262260437, "rewards/chosen": 2.8574609756469727, "rewards/margins": 13.30062484741211, "rewards/rejected": -10.443046569824219, "step": 9800 }, { "epoch": 1.97, "grad_norm": 0.001617431640625, "learning_rate": 1.5910589251837258e-06, "logits/chosen": -0.9589361548423767, "logits/rejected": -2.654125928878784, "logps/chosen": -141.08250427246094, "logps/rejected": -274.427490234375, "loss": 0.0899, "rewards/accuracies": 0.8700000047683716, "rewards/chosen": 2.935683488845825, "rewards/margins": 13.803203582763672, "rewards/rejected": -10.866328239440918, "step": 9850 }, { "epoch": 1.98, "grad_norm": 0.000946044921875, "learning_rate": 1.5640229555146237e-06, "logits/chosen": -0.9323858618736267, "logits/rejected": -2.6372363567352295, "logps/chosen": -140.74000549316406, "logps/rejected": -276.75, "loss": 0.0882, "rewards/accuracies": 0.8725000023841858, "rewards/chosen": 3.178281307220459, "rewards/margins": 13.927734375, "rewards/rejected": -10.7514066696167, "step": 9900 }, { "epoch": 1.99, "grad_norm": 1.265625, "learning_rate": 1.537113702154668e-06, "logits/chosen": -1.0329382419586182, "logits/rejected": -2.759021520614624, "logps/chosen": -140.61500549316406, "logps/rejected": -277.2174987792969, "loss": 0.0847, "rewards/accuracies": 0.8774999976158142, "rewards/chosen": 2.995312452316284, "rewards/margins": 13.927968978881836, "rewards/rejected": -10.926405906677246, "step": 9950 }, { "epoch": 2.0, "grad_norm": 0.69140625, "learning_rate": 1.5103348081860159e-06, "logits/chosen": -0.9912976026535034, "logits/rejected": -2.705747127532959, "logps/chosen": -140.69749450683594, "logps/rejected": -283.8374938964844, "loss": 0.0709, "rewards/accuracies": 0.8974999785423279, "rewards/chosen": 3.1512889862060547, "rewards/margins": 14.588281631469727, "rewards/rejected": -11.444062232971191, "step": 10000 } ], "logging_steps": 50, "max_steps": 15000, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }